This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
Should include all data files:
library(dplyr)
library(vegan)
library(pvclust)
library(corrplot)
library(GGally)
library(gclus)
library(rcompanion)
library(ggplot2)
library(ape)
library(adespatial)
library(ade4)
library(knitr)
library(ridgeline)
library(ggridges)
library(mgcv)
library(viridis)
library(tidyverse)
library(tidyr)
library(readxl)
library(geosphere)
library(lubridate)
library(kableExtra)
library(splines)
library(ggpubr)
library(ggrepel)
#remotes::install_github("MikkoVihtakari/ggOceanMaps")
library(ggOceanMaps)
#expanded data - DATASET A
#expanded data - DATASET A
data.a <- read.csv("./expanded_data.csv", sep = ",", header=TRUE) #extracted from raw data where counts are translated into rows - This is also called Dataset A
# wide data - DATASET B
data.b <- read.csv("./wide_data.csv", sep = ",", header=TRUE) #this file is wide data plus all the env data that I extracted manually - This is also called Dataset B, and will become dataset C when aggregated by site
#Load taxa sheet file - TAXA
taxa <- read.csv("./taxa_sheet_all.csv", sep = ",", header=TRUE) #all morphotypes and their metadata
#Load the forest data (timings/frame grabs for the garden density assessment; see the table at the end of the document)
garden_length_df <- read.csv("./forest_data.csv", sep = ",", header=TRUE)
Explanation: Dive_Name is the code for each EX dive. Here are the correct names of each location. In some plots only the numbers will be shown, so here is the reference list:
- 2304_02 = Big Bend
- 2304_05 = Lone Knoll
- 2304_07 = Uliaga
- 2304_08 = Umnak
- 2306_01 = Kodiak
- 2306_03 = Giacomini
- 2306_04 = Quinn
- 2306_05 = Surveyor
- 2306_06 = Durgin
- 2306_07 = Deep Discover
- 2306_08 = Denson
- 2306_12 = Noyes
- 2306_14 = Chatham
- 2306_15 = Middleton
- 2306_18 = Gumby
Before starting any data manipulation, getting some data summaries and basic statistics
Here I calculate the basics, like how many taxa, how many observations etc. This is done before aggregating the data into depth bins, as the data is still complete here and I can work with the rows. This uses the full expanded dataset (Dataset A) and the taxa sheet.
#How many total morphotypes? (one row per morphotype in the taxa sheet)
nrow(taxa)
## [1] 164
#How many total coral morphotypes?
sum(taxa$Phylum == "Cnidaria")
## [1] 74
#How many total sponge morphotypes?
sum(taxa$Phylum == "Porifera")
## [1] 90
#How many total observations?
nrow(data.a) #counting all rows of the Dataset A
## [1] 15531
#How many coral observations?
sum(data.a$Phylum == "Cnidaria")
## [1] 7317
#How many sponge observations?
sum(data.a$Phylum == "Porifera")
## [1] 8214
#How many coral observations of species level? (an empty Species string means not identified to species)
sum(data.a$Phylum == "Cnidaria" & data.a$Species != "")
## [1] 4056
#How many sponge observations of species level?
sum(data.a$Phylum == "Porifera" & data.a$Species != "")
## [1] 997
#How many (and which) morphotypes occurred in a single dive only?
# First, aggregate the data to count the occurrences of each morphotype across all dives
#How many (and which) morphotypes occurred in a single dive only?
# Tally, for every morphotype, how many distinct dives it was recorded on
morphotype_counts <- data.a %>%
  distinct(Morphotype, Dive.Name) %>%
  count(Morphotype, name = "Count")
# Keep only the morphotypes recorded on exactly one dive
morphotypes_single_dive <- morphotype_counts %>%
  filter(Count == 1) #change number for what to check
# How many such single-dive morphotypes are there?
num_single_dive_morphotypes <- nrow(morphotypes_single_dive)
print(num_single_dive_morphotypes)
## [1] 103
# Express this as a percentage of all morphotypes observed in Dataset A:
(100/(length(unique(data.a$Morphotype))))*num_single_dive_morphotypes
## [1] 62.80488
# Can print the morphotypes that occurred in a single dive only:
#knitr::kable(morphotypes_single_dive, caption = "Morphotypes that occurred in a single dive location only")
Now I want to plot density (number of counts) by depth and divided for each phylum. Since some depths have been visited several times, I am scaling the counts by the duration of each dive, which results in an effort-corrected count. This way there is no bias on the density, e.g. if more time spent in 700 m because there were 2 dives. The plot also gives an overview of the highest abundances, and how the abundances change over depth.
# Find the effort of time by depth
# Calculate duration for each dive
# Convert Start.Date and End.Date to POSIXct datetime format
# (assumes timestamps look like "20230701T123456" in UTC - TODO confirm against the raw file)
data.a <- data.a %>%
mutate(
Start.Time = as.POSIXct(Start.Date, format = "%Y%m%dT%H%M%S", tz = "UTC"),
End.Time = as.POSIXct(End.Date, format = "%Y%m%dT%H%M%S", tz = "UTC")
)
# Calculate duration for each dive: elapsed seconds between the earliest start and the latest end timestamp
dive_duration <- data.a %>%
dplyr::group_by(Dive.Name) %>%
dplyr::summarize(Duration = as.numeric(difftime(max(End.Time), min(Start.Time), units = "secs")), .groups = "drop")
# Durations are in seconds (difftime above uses units = "secs"), so label the table accordingly
knitr::kable(dive_duration, caption = "Duration by Dive site (sec)")
| Dive.Name | Duration |
|---|---|
| EX2304_DIVE02 Big Bend Canyon | 18020 |
| EX2304_DIVE05 Lone Knoll Scarp | 9490 |
| EX2304_DIVE07 Uliaga Mound Take 2 | 23204 |
| EX2304_DIVE08 Umnak Canyon | 8921 |
| EX2306_Dive01 Kodiak Shelf | 15205 |
| EX2306_Dive03 Giacomini Seamount | 21482 |
| EX2306_Dive04 Quinn Seamount | 14921 |
| EX2306_Dive05 Surveyor Seamount | 24769 |
| EX2306_Dive06 Durgin Guyot | 23065 |
| EX2306_Dive07 Deep Discover Dome | 14308 |
| EX2306_Dive08 Denson Seamount | 22300 |
| EX2306_Dive12 Noyes Canyon | 19411 |
| EX2306_Dive14 Chatham Seep | 22549 |
| EX2306_Dive15 Middleton Canyon | 17229 |
| EX2306_Dive18 Gumby Ridge | 15838 |
# Add a z-scored dive duration (dimensionless, not seconds). scale() returns a
# 1-column matrix, so wrap it in as.numeric() to keep scaled_time a plain
# numeric vector - matrix columns in data frames cause surprises in later joins.
dive_duration <- dive_duration %>%
  mutate(scaled_time = as.numeric(scale(Duration)))
# Count occurrences per Depth and Phylum
# (one row per unique depth reading x phylum x dive, with the number of annotations)
df_summary <- data.a %>%
group_by(Depth..m., Phylum, Dive.Name) %>%
summarise(Count = n(), .groups = "drop")
# Merge dive durations with df_summary
df_corrected <- df_summary %>%
left_join(dive_duration, by = "Dive.Name") %>%
mutate(Duration_hr = as.numeric(Duration) / 3600, # Convert seconds to hours
Effort_Corrected_Count = Count / Duration_hr) # Normalize counts by hours
# Plot density with effort correction
# (the weight aesthetic makes the kernel density weighted by the
# effort-corrected counts rather than by raw row frequency)
ggplot(df_corrected, aes(x = Depth..m., fill = Phylum, weight = Effort_Corrected_Count)) +
geom_density(alpha = 0.7) + # Density plot with transparency
scale_fill_manual(values = c("Cnidaria" = "#d8514e", "Porifera" = "#f2c714")) + # Custom colors
scale_x_reverse(breaks=seq(400, 3200, 200)) + # Flip depth axis so shallowest is on top
coord_flip() +
labs(x = "Depth (m)", y = "Effort-Corrected Density", fill = "Phylum") +
theme_minimal()
#plot without effort correction (raw annotation rows, unweighted)
ggplot(data.a, aes(x = Depth..m., fill = Phylum)) +
geom_density(alpha = 0.7) + # Adjust transparency
scale_fill_manual(values = c("Cnidaria" = "#d8514e", "Porifera" = "#f2c714")) +
labs(x = "Depth (m)", y = "Density", fill = "Phylum") +
theme_minimal() +
coord_flip() +
scale_x_reverse(breaks=seq(400, 3200, 200))
# Bin the df into depth for a test on the abundances by depth - bins in 5 meter, but can be adapted.
# NOTE(review): depth_bin is created here but the models below use the continuous
# Depth..m. term, so the binning is not actually used in this section - confirm intent.
df_corrected <- df_corrected %>%
mutate(depth_bin = cut(Depth..m., breaks = seq(0, max(Depth..m.), by = 5), include.lowest = TRUE)) %>%
mutate(depth_bin = as.factor(depth_bin))
# Is there a linear effect of depth on the effort-corrected counts?
glm_model <- glm(Effort_Corrected_Count ~ Depth..m., data = df_corrected, family = quasipoisson) # need to use quasipoisson because its non-integer counts with the correction
summary(glm_model) # No
##
## Call:
## glm(formula = Effort_Corrected_Count ~ Depth..m., family = quasipoisson,
## data = df_corrected)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.576e-02 6.598e-02 -0.997 0.319
## Depth..m. 4.992e-05 4.067e-05 1.228 0.220
##
## (Dispersion parameter for quasipoisson family taken to be 2.974241)
##
## Null deviance: 4390.0 on 2802 degrees of freedom
## Residual deviance: 4385.6 on 2801 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 5
# NOTE(review): Depth..m. is already numeric, so this fit is identical to glm_model
# above (the printed summaries show the same coefficients) - the as.numeric() wrapper is redundant.
glm_numeric <- glm(Effort_Corrected_Count ~ as.numeric(Depth..m.), data = df_corrected, family = quasipoisson)
summary(glm_numeric) # No
##
## Call:
## glm(formula = Effort_Corrected_Count ~ as.numeric(Depth..m.),
## family = quasipoisson, data = df_corrected)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.576e-02 6.598e-02 -0.997 0.319
## as.numeric(Depth..m.) 4.992e-05 4.067e-05 1.228 0.220
##
## (Dispersion parameter for quasipoisson family taken to be 2.974241)
##
## Null deviance: 4390.0 on 2802 degrees of freedom
## Residual deviance: 4385.6 on 2801 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 5
# Is there a non-linear effect? Fit a GAM with a smooth of depth.
gam_model <- gam(Effort_Corrected_Count ~ s(Depth..m.), data = df_corrected, family = quasipoisson)
summary(gam_model) # yes, significant, meaning that there are peaks with higher abundances, but no linear decline.
##
## Family: quasipoisson
## Link function: log
##
## Formula:
## Effort_Corrected_Count ~ s(Depth..m.)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.12945 0.03387 -3.822 0.000135 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Depth..m.) 8.888 8.996 21.97 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.058 Deviance explained = 13.6%
## GCV = 1.3632 Scale est. = 2.3104 n = 2803
plot(gam_model) # visualise the estimated smooth term s(Depth..m.)
# Is there an overall depth effect? Sequential deviance test on the linear quasipoisson model.
anova(glm_model, test = "Chisq") # no
## Analysis of Deviance Table
##
## Model: quasipoisson, link: log
##
## Response: Effort_Corrected_Count
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 2802 4390.0
## Depth..m. 1 4.4372 2801 4385.6 0.2219
# And if we use the non-corrected data?
# NOTE(review): this overwrites the effort-corrected glm_model fitted above.
glm_model <- glm(Count ~ Depth..m., data = df_summary, family = poisson) # raw counts are integers, so a plain Poisson family can be used here
summary(glm_model) # Poisson - this is significant, but might be biased because of the sampling effort
##
## Call:
## glm(formula = Count ~ Depth..m., family = poisson, data = df_summary)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.9405898 0.0159777 121.46 <2e-16 ***
## Depth..m. -0.0001719 0.0000108 -15.92 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 24165 on 2802 degrees of freedom
## Residual deviance: 23902 on 2801 degrees of freedom
## AIC: 32126
##
## Number of Fisher Scoring iterations: 6
In geom_density(), the density represents a smoothed estimate of the distribution of observations across depth. It’s similar to a histogram but smoothed into a continuous curve. The uncorrected plot could over-represent depths where more time was spent during each dive, also overlapping dives. The effort-corrected plot gives a better idea of relative abundance per unit effort, making it more comparable across depths.
The statistical tests of the depth effect on abundances show a complex situation. There is no linear trend — abundances do not simply decrease with depth — but there are peaks. From the GAM model there are three large peaks (this could also be because of the sites…), but the middle peak is the lowest.
Continuing with morphotypes per depth, as an overview of how many morphotypes were seen across the depth range.
# Count unique morphotypes per depth for each phylum
morpho_diversity <- data.a %>%
group_by(Depth..m., Phylum) %>% # by each depth now, but can also be done with depth bin, replacing Depth..m. with depth_bin
summarise(Unique_Morphotypes = n_distinct(Morphotype), .groups = "drop")
# Rebuild the effort-corrected counts from df_summary
# NOTE(review): this overwrites the earlier df_corrected, discarding the depth_bin column added above
df_corrected <- df_summary %>%
left_join(dive_duration, by = "Dive.Name") %>%
mutate(Duration_hr = as.numeric(Duration) / 3600, # Convert seconds to hours
Effort_Corrected_Count = Count / Duration_hr) # Normalize counts by hours
# Step 2: Merge with df_corrected (Ensuring Consistency)
df_corrected <- df_corrected %>%
left_join(morpho_diversity, by = c("Depth..m.", "Phylum"))
df_corrected <- df_corrected %>%
mutate(Effort_Corrected_Div = Unique_Morphotypes / Duration_hr) # Normalize morphotype counts by hours too
# Plot effort-corrected diversity against depth
# NOTE(review): the weight aesthetic only affects density layers; with geom_density()
# commented out and only geom_smooth() drawn it has no effect - confirm intent
ggplot(df_corrected, aes(x = Depth..m., fill = Phylum, weight = Effort_Corrected_Count)) +
#geom_density(alpha = 0.7) + # Density plot with transparency
geom_smooth(aes(y = Effort_Corrected_Div, color = Phylum), method = "loess", se = TRUE) +
scale_fill_manual(values = c("Cnidaria" = "#d8514e", "Porifera" = "#f2c714")) + # Custom color
coord_flip() +
labs(x = "Depth (m)", y = "Effort-Corrected Density", fill = "Phylum") +
theme_minimal() +
scale_x_reverse(breaks=seq(400, 3200, 200))
## `geom_smooth()` using formula = 'y ~ x'
The morphotype density shows that most morphotypes were observed at shallower depths, but with continuous diversity below the OMZ. Curiously, when using the corrected data, it seems as if corals are more abundant and more diverse compared to sponges. This could be due to the effort correction.
## Extract metadata for each dive - use dataset B
meta_summary <- data.b[,169:181] #select only the columns that have the environmental data, exclude any morphotype data
meta_summary$Group <- data.b$Dive.Name
# Per-dive mean and sd of every environmental column
# NOTE(review): summarise_all() is superseded in current dplyr (across() is the
# modern route) - left as-is to keep the printed column names unchanged
meta_summary <- meta_summary %>%
group_by(Group) %>%
summarise_all(list(~ mean(., na.rm = TRUE), ~ sd(., na.rm = TRUE))) # Compute mean and sd
kable(meta_summary) %>%
kable_styling(font_size = 5, latex_options = c("striped"))
| Group | lon_mean | oxygen_mean | depth_mean | temp_mean | aspect_mean | aspect_rad_mean | eastness_mean | northness_mean | rugosity_mean | slope_mean | bs_mean | feature_mean | sed_type_mean | lon_sd | oxygen_sd | depth_sd | temp_sd | aspect_sd | aspect_rad_sd | eastness_sd | northness_sd | rugosity_sd | slope_sd | bs_sd | feature_sd | sed_type_sd |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2304_02 | -156.5281 | 2.1910415 | 2234.8660 | 1.828801 | 99.62525 | 1.7387886 | 0.4664148 | 0.0157865 | 8.627509 | 37.614612 | 125.86228 | 1 | 2.904192 | 0.0009742 | 0.0652827 | 49.99673 | 0.0235020 | 70.3430048 | 1.2277170 | 0.3543062 | 0.8132572 | 6.469064 | 8.041973 | 20.900198 | 0 | 0.6875268 |
| 2304_05 | -164.1699 | 3.1930806 | 2780.7085 | 1.610589 | 277.59723 | 4.8449856 | -0.1471961 | 0.4206298 | 29.204248 | 26.247497 | 30.25000 | 5 | 2.750000 | 0.0009851 | 0.0561942 | 22.62232 | 0.0076712 | 94.7809943 | 1.6542404 | 0.3946594 | 0.8361069 | 11.377150 | 9.810891 | 50.651094 | 0 | 0.4472136 |
| 2304_07 | -169.7784 | 0.9301650 | 653.8028 | 3.498381 | 125.76859 | 2.1950760 | 0.0641047 | -0.1494093 | 13.853232 | 40.198865 | 76.10465 | 3 | 5.000000 | 0.0012845 | 0.0830034 | 70.81022 | 0.0606151 | 84.8186776 | 1.4803652 | 0.5484088 | 0.8219816 | 7.528911 | 5.314185 | 84.358706 | 0 | 0.0000000 |
| 2304_08 | -168.8173 | 1.0963850 | 1460.3310 | 2.268787 | 209.17722 | 3.6508312 | -0.4743283 | -0.8509267 | 8.857124 | 15.661749 | 50.75000 | 1 | 1.000000 | 0.0007677 | 0.0352930 | 11.34952 | 0.0266447 | 13.6620204 | 0.2384472 | 0.2000582 | 0.1246746 | 2.955730 | 2.718459 | 6.510481 | 0 | 0.0000000 |
| 2306_01 | -152.9192 | 3.4617127 | 3030.8534 | 1.561238 | 23.70798 | 0.4137823 | 0.3944582 | 0.9013083 | 76.258460 | 17.336482 | 84.34667 | 5 | 4.360000 | 0.0005010 | 0.0566058 | 40.13850 | 0.0061754 | 10.4068466 | 0.1816337 | 0.1556529 | 0.0908370 | 16.909731 | 4.310947 | 20.340011 | 0 | 1.4762992 |
| 2306_03 | -146.3149 | 0.5129225 | 805.2009 | 3.120205 | 40.24966 | 0.7024891 | 0.6451784 | 0.7620911 | 50.542931 | 21.234674 | 153.23497 | 3 | 5.295082 | 0.0019727 | 0.0204957 | 44.00440 | 0.0944003 | 3.1251338 | 0.0545439 | 0.0423481 | 0.0343036 | 20.994250 | 8.112470 | 12.886841 | 0 | 0.4567039 |
| 2306_04 | -145.1337 | 1.6749253 | 1947.3367 | 1.971742 | 125.67085 | 2.1933702 | 0.7595301 | -0.5538567 | 61.284492 | 25.333441 | 127.41176 | 3 | 5.000000 | 0.0021964 | 0.1254428 | 54.19895 | 0.0453028 | 20.3086824 | 0.3544534 | 0.1563863 | 0.3060012 | 8.686628 | 3.112950 | 14.455526 | 0 | 0.0000000 |
| 2306_05 | -144.3024 | 0.6546702 | 507.0881 | 3.778363 | 25.28822 | 0.4413627 | 0.3360411 | 0.9251788 | 62.130555 | 28.748697 | 30.26341 | 3 | 4.668293 | 0.0018735 | 0.0571162 | 72.49600 | 0.0618734 | 40.8472395 | 0.7129188 | 0.1582382 | 0.0789545 | 12.422581 | 6.355581 | 36.553062 | 0 | 0.6240440 |
| 2306_06 | -141.7489 | 0.5309211 | 1102.1208 | 2.849579 | 123.98051 | 2.1638681 | 0.8124468 | -0.5473260 | 72.130593 | 31.312636 | 157.77679 | 3 | 5.000000 | 0.0015295 | 0.0473178 | 68.69183 | 0.1059867 | 11.6258698 | 0.2029097 | 0.1156312 | 0.1648482 | 17.508557 | 7.242374 | 17.900177 | 0 | 0.0000000 |
| 2306_07 | -140.8336 | 3.5516965 | 3224.2976 | 1.546710 | 44.42222 | 0.7753139 | 0.6924042 | 0.7069126 | 86.726845 | 18.455157 | 73.56835 | 3 | 4.424460 | 0.0007628 | 0.0446158 | 33.79797 | 0.0060338 | 8.3553094 | 0.1458277 | 0.0995570 | 0.1053120 | 9.216274 | 2.029851 | 11.127750 | 0 | 1.4089693 |
| 2306_08 | -137.3825 | 0.6443063 | 1321.5616 | 2.571358 | 24.32856 | 0.4246134 | 0.4086943 | 0.9044223 | 67.050512 | 21.405161 | NaN | 3 | 5.108696 | 0.0017920 | 0.0293136 | 52.40034 | 0.0358585 | 7.0567024 | 0.1231627 | 0.1097229 | 0.0550647 | 35.784204 | 9.636119 | NA | 0 | 0.7459401 |
| 2306_12 | -134.5180 | 1.0701339 | 1543.5204 | 2.321652 | 341.88219 | 5.9669699 | -0.3109539 | 0.9503646 | 151.019993 | 33.012836 | 79.66327 | 1 | 3.714286 | 0.0010175 | 0.1196809 | 52.27767 | 0.0918113 | 0.6168777 | 0.0107665 | 0.0102285 | 0.0033603 | 19.277169 | 4.302193 | 11.514172 | 0 | 1.4713764 |
| 2306_14 | -135.4970 | 0.6271747 | 713.7097 | 4.102332 | 192.73962 | 3.3639410 | -0.1583710 | -0.8414947 | 36.632457 | 8.602384 | 167.80556 | 4 | 4.631944 | 0.0018600 | 0.0231609 | 8.67190 | 0.0355006 | 31.9745955 | 0.5580620 | 0.4399844 | 0.2740161 | 16.161008 | 4.730903 | 16.551795 | 0 | 2.9130680 |
| 2306_15 | -145.7137 | 2.1287497 | 2108.7174 | 1.819657 | 214.31090 | 3.7404308 | -0.5635357 | -0.8257710 | 175.601596 | 33.961758 | 150.10169 | 1 | 4.335593 | 0.0010582 | 0.0802843 | 57.57299 | 0.0196996 | 1.3210700 | 0.0230570 | 0.0192034 | 0.0127559 | 27.263599 | 4.340210 | 13.588830 | 0 | 1.0844859 |
| 2306_18 | -147.0119 | 1.3704871 | 1795.3608 | 2.096349 | 343.45519 | 5.9944239 | -0.2607236 | 0.9182708 | 106.413534 | 24.903195 | 150.02439 | 2 | 2.849594 | 0.0005074 | 0.1014007 | 37.82639 | 0.0442920 | 17.8050098 | 0.3107560 | 0.2352109 | 0.1839571 | 16.345194 | 7.434391 | 14.057877 | 0 | 0.3908910 |
# What are max/min values for depth, oxygen etc?
max(data.a$Depth..m.)
## [1] 3284.324
min(data.a$Depth..m.)
## [1] 380.2322
max(data.a$Oxygen.Concentration..mg.l.)
## [1] 3.67951
min(data.a$Oxygen.Concentration..mg.l.)
## [1] 0.45809
max(data.a$Salinity..psu.)
## [1] 34.66219
min(data.a$Salinity..psu.)
## [1] 34.07224
# Threshold values for OMZ: <1ml/L or hypoxic <0.5 ml/L - need to calculate in mg/L to make it comparable:
# Conversion factor: 1 ml/L of O2 = 1.429 mg/L
print(omz <- 1*1.429) # OMZ threshold of 1 ml/L expressed in mg/L
## [1] 1.429
print(hypoxic <- 0.5*1.429) # severe hypoxia threshold of 0.5 ml/L expressed in mg/L (= 0.7145)
## [1] 0.7145
# Which depths are inside the OMZ?
within_omz <- data.a %>% filter(Oxygen.Concentration..mg.l. < omz)
min(within_omz$Depth..m.)
## [1] 380.2322
max(within_omz$Depth..m.)
## [1] 1846.175
Check if any of the environmental variables are correlated, this is important for the analysis, and can be used to exclude variables, simplifying the further analysis.
# Compute Pearson correlation matrix between the six environmental columns
# (columns 17:22 of data.a: lat, lon, oxygen, temperature, depth, salinity - see printed output below)
env.pearson <- cor(data.a[,17:22], use = "pairwise.complete.obs")
round(env.pearson, 2)
## Latitude..deg. Longitude..deg.
## Latitude..deg. 1.00 0.63
## Longitude..deg. 0.63 1.00
## Oxygen.Concentration..mg.l. 0.17 0.07
## Temperature..C. -0.43 -0.21
## Depth..m. 0.38 0.29
## Salinity..psu. 0.54 0.42
## Oxygen.Concentration..mg.l. Temperature..C.
## Latitude..deg. 0.17 -0.43
## Longitude..deg. 0.07 -0.21
## Oxygen.Concentration..mg.l. 1.00 -0.81
## Temperature..C. -0.81 1.00
## Depth..m. 0.92 -0.94
## Salinity..psu. 0.75 -0.97
## Depth..m. Salinity..psu.
## Latitude..deg. 0.38 0.54
## Longitude..deg. 0.29 0.42
## Oxygen.Concentration..mg.l. 0.92 0.75
## Temperature..C. -0.94 -0.97
## Depth..m. 1.00 0.94
## Salinity..psu. 0.94 1.00
# Pairplot with correlation coefficients for the same six environmental columns
ggpairs(data.a[,17:22],
lower = list(continuous = wrap("smooth", color = "blue")), # Smoothed regression lines
upper = list(continuous = wrap("cor", size = 5)), # Pearson correlation values
diag = list(continuous = wrap("barDiag", fill = "lightblue"))) + # Histogram on diagonals
theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Correlation plot: upper triangle only, with coefficient values overlaid in black
corrplot(env.pearson, method = "circle", type = "upper", tl.col = "black", tl.cex = 0.8, addCoef.col = "black")
Figure 2.
# Create the plot 1) Temperature vs Oxygen, points coloured by depth
ggplot(data.a, aes(y = Temperature..C., x = Oxygen.Concentration..mg.l., color = Depth..m.)) +
geom_point(size = 1) + # Scatter plot with color representing depth
#scale_color_gradient(name = "Depth (m)", low = "blue", high = "red") + # Color gradient
scale_color_viridis(option = "viridis", direction = -1) + # Reversed viridis scale (colorblind-friendly)
#scale_y_reverse() + # Reverse y-axis to put shallow depth on top
xlab("Oxygen (mg/L)") + # X-axis label
ylab("Temperature (°C)") + # Left y-axis label
theme_linedraw() +
theme(panel.grid = element_blank(),
legend.title = element_text(size = 10),
legend.text = element_text(size = 10),
axis.title.x = element_text (size = 14),
axis.title.y = element_text(size = 14),
legend.background = element_blank(),
legend.box.background= element_rect(colour="black"),
legend.position = "right")
# Create the plot 2) Salinity vs Temperature, points coloured by depth
ggplot(data.a, aes(y = Temperature..C., x = Salinity..psu., color = Depth..m.)) +
geom_point(size = 1) + # Scatter plot with color representing depth
#scale_color_gradient(name = "Depth (m)", low = "blue", high = "red") + # Color gradient
scale_color_viridis(option = "viridis", direction = -1) + # Reversed viridis scale (colorblind-friendly)
#scale_y_reverse() + # Reverse y-axis to put shallow depth on top
xlab("Salinity (psu)") + # X-axis label
ylab("Temperature (°C)") + # Left y-axis label
theme_linedraw() +
theme(panel.grid = element_blank(),
legend.title = element_text(size = 10),
legend.text = element_text(size = 10),
axis.title.x = element_text (size = 14),
axis.title.y = element_text(size = 14),
legend.background = element_blank(),
legend.box.background= element_rect(colour="black"),
legend.position = "right")
The observations are binned into 5 meter depth bins to simplify and aggregate data and reduce computation time. Otherwise, some analyses take a long time. Here we will be using the Dataset B - the wide data.
depth_bins <- seq(0, max(data.b$depth, na.rm = TRUE)+100, by = 5) # in 5 meter bins, indicated by=5; +100 adds head-room past the deepest record
depth_labels <- paste0(depth_bins[-length(depth_bins)] + 1, "-", depth_bins[-1]) #add labels for each depth bin, e.g. "1-5", "6-10"
# Add depth bin column to the dataframe
data.b <- data.b %>%
mutate(depth_bin = cut(depth, breaks = depth_bins, labels = depth_labels, include.lowest = TRUE))
# Aggregate species data by Dive.Name and Depth_bin
df_species_aggregated <- data.b %>%
group_by(Dive.Name, depth_bin) %>% #Dive.Name is the column that indicates the site/location (E.g. EX2304_07 is Uliaga seamount)
summarise(across(3:166, sum, .names = "{.col}"), .groups = "drop") #across all columns with taxa (positions are relative to the non-grouping columns)
# Aggregate environmental variables by Dive.Name and Depth_bin
# Return the most frequent value of x (ties broken by first appearance);
# NA values are ignored, and NA is returned when x has no non-missing values.
get_mode <- function(x) {
  x <- x[!is.na(x)]
  if (length(x) == 0) {
    return(NA)
  }
  vals <- unique(x)
  freq <- vapply(vals, function(v) sum(x == v), integer(1))
  vals[which.max(freq)]
}
# Ensure lat/lon are numeric before aggregation, otherwise there might be an error
data.b$lat <- as.numeric(data.b$lat)
data.b$lon <- as.numeric(data.b$lon)
# Aggregate environmental variables by Dive.Name and depth_bin:
# mean for the numeric columns, most common value (mode) for the character columns.
df_env_aggregated <- data.b %>%
  group_by(Dive.Name, depth_bin) %>%
  # purrr-style lambda: passing na.rm through across()'s ... is deprecated
  # since dplyr 1.1 - this was the warning the original comment alluded to
  summarise(across(167:179, ~ mean(.x, na.rm = TRUE)), # Numeric env columns
            sed_type = get_mode(sed_type), #since this is a character, we need to use the primary sediment type (most common one)
            sed_consistency = get_mode(sed_consistency), #the same here
            .groups = "drop")
# Merge species and environmental data
# NOTE(review): this replaces the row-level wide data.b with the bin-aggregated version
data.b <- left_join(df_species_aggregated, df_env_aggregated, by = c("Dive.Name", "depth_bin"))
#OBS for Dive 2306-08 its NA because backscatter (bs) had no values due to technical errors during the dive
#Add the "water mass and/or current" to the dataset, assigned by matching the dive code
data.b <- data.b %>%
mutate(watermass = case_when(
grepl("2306_04|2306_08", Dive.Name) ~ "PDW", #PDW (without LCDW)
grepl("2306_03|2306_05|2306_06", Dive.Name) ~ "PSIW", #Gulf of Alaska Gyre (Alaska current + eddies) = PSIW water mass
grepl("2304_07|2304_08", Dive.Name) ~ "ANSC" , #ANSC + Samalga Pass
grepl("2306_12|2306_14|2306_15|2306_18", Dive.Name) ~ "ACC" , #ACC Alaska coastal current, on the shelf
grepl("2304_02|2304_05|2306_01|2306_07", Dive.Name) ~ "PDW/LCDW" #PDW/LCDW water mass - deepest locations too
))
#write.xlsx(data.b, "aggregated_data.xlsx") #to check the df if needed
#Extract individual df for coordinates, species data and env data
#coord <-data.b[,167:168] #select the columns that have the lat/lon
# NOTE(review): data.b is a tibble after the join above, so data.b[, 167] is a
# one-column tibble rather than a vector; data.frame() flattens it using the
# inner column name - confirm coord ends up with columns named lat/lon as intended
coord <- data.frame(
lat = round(data.b[, 167], 6),
lon = round(data.b[, 168], 6))
ENV <-data.b[,169:181] #all the env data
ENV <- ENV[,-c(4:5)] #remove aspect and aspect_rad, not needed
SPE <-data.b[,3:166] #all annotation/taxa data
#write.xlsx(SPE, "SPE_data.xlsx") #to check the df if needed
#write.xlsx(SPE, "SPE_data.xlsx") #to check the df if needed
In some cases, a dataset is needed where all data is aggregated by site. Here, I will call this Dataset C (short dataset) in contrast to Dataset B (full dataset, rows being annotation timestamps)
#Use Dive.Name to combine/summarise and create a new combined df (Dataset C: one row per dive site)
data.c <- data.b %>%
  group_by(Dive.Name) %>%
  summarise(
    across(all_of(colnames(SPE)), sum, .names = "{.col}"), # Sum for SPE (taxa count) columns
    sed_type = get_mode(sed_type), #since this is a character, we need to use the primary sediment type (most common one)
    sed_consistency = get_mode(sed_consistency), #the same here
    # purrr-style lambda: passing na.rm through across()'s ... is deprecated in dplyr >= 1.1
    across(where(is.numeric) & all_of(colnames(ENV)), ~ mean(.x, na.rm = TRUE), .names = "{.col}"), # Mean for numeric ENV columns
    across(all_of(colnames(coord)), first, .names = "{.col}"), # First value for coord columns (one coordinate per dive)
    .groups = "drop"
  ) %>%
  as.data.frame()
# Assign each dive to its water mass / current system (same mapping as used for Dataset B)
data.c <- data.c %>%
  mutate(watermass = case_when(
    grepl("2306_04|2306_08", Dive.Name) ~ "PDW", #PDW (without LCDW)
    grepl("2306_03|2306_05|2306_06", Dive.Name) ~ "PSIW", #Gulf of Alaska Gyre (Alaska current + eddies) = PSIW water mass
    grepl("2304_07|2304_08", Dive.Name) ~ "ANSC", #ANSC + Samalga Pass
    grepl("2306_12|2306_14|2306_15|2306_18", Dive.Name) ~ "ACC", #ACC Alaska coastal current, on the shelf
    grepl("2304_02|2304_05|2306_01|2306_07", Dive.Name) ~ "PDW/LCDW" #PDW/LCDW water mass - deepest locations too
  ))
Now I want to create again SPE and ENV dfs from the combined dataset C
# Split Dataset C into environmental and taxa matrices
# NOTE(review): positions assume Dive.Name first, then 164 taxa columns, then env - confirm if columns change
ENV2 <-data.c[,166:176] #all the env data
SPE2 <-data.c[,2:165] #all annotation/taxa data
#write.xlsx(SPE2, "SPE_data.xlsx") #to check the df if needed
Since there might be patterns at the family level instead of the morphotype level, the data is aggregated to family level.
# Prepare the taxa sheet for family-level aggregation:
# replace empty strings with real NA so the fallback logic below works
taxa[taxa == ""] <- NA
# Fill missing Family entries: use Class when available, otherwise fall back to Phylum
fill_family <- function(df) {
  needs_fill <- is.na(df$Family)
  fallback <- ifelse(is.na(df$Class), df$Phylum, df$Class)
  df$Family[needs_fill] <- fallback[needs_fill]
  df
}
taxa_filled <- fill_family(taxa)
# Map each SPE column (morphotype code) to its (filled) family
column_names <- data.frame(Morphotype = colnames(SPE), stringsAsFactors = FALSE)
taxa_mapping <- taxa_filled %>%
dplyr::select(Morphotype, Family) %>%
dplyr::mutate(Family = as.factor(Family))
# Merge column names with taxa information (all.x keeps morphotypes missing from the taxa sheet, with NA family)
column_info <- merge(column_names, taxa_mapping, by = "Morphotype", all.x = TRUE)
# Group columns by family: one row per family holding the list of its morphotype column names
grouped_columns <- column_info %>%
group_by(Family) %>%
summarise(Morphotype = list(Morphotype), .groups = 'drop')
# Initialize a new data frame for combined data: one column per family, one row per sample
SPE.fam <- data.frame(matrix(ncol = nrow(grouped_columns), nrow = nrow(SPE)))
colnames(SPE.fam) <- grouped_columns$Family
# Combine columns by family: sum all morphotype columns mapped to the same family
for (i in seq_len(nrow(grouped_columns))) { # seq_len() avoids the 1:0 footgun if grouped_columns were ever empty
  family_columns <- grouped_columns$Morphotype[[i]]
  # drop = FALSE keeps a data frame even when a family has a single morphotype column
  selected_data <- SPE[ , family_columns, drop = FALSE]
  # Sum columns belonging to the same family
  SPE.fam[[i]] <- rowSums(selected_data, na.rm = TRUE)
}
# print to check
# SPE.fam
Here the morphotype data is explored in more detail: which family is most speciose, and so on.
# Which is the most abundant morphotype?
# Sum the counts for each morphotype (column) across all locations (rows)
total_abundance <- colSums(SPE)
# Sort the morphotypes by total abundance in descending order
most_abundant_taxa <- sort(total_abundance, decreasing = TRUE)
# View the sorted list of most abundant taxa
most_abundant_taxa
## PD18 PD03 CAO36 CAO48 PH50 CAO18 CAH02 PD05 PH10 PH23 PD22 CAO51 CAO14
## 1185 1184 1093 696 652 593 549 543 525 524 505 500 387
## CAO54 CAO47 P06 CAO02 PD02 CAO16 PH12 PH51 CAO24 PH17 CAO49 PD19 CAO50
## 376 367 350 318 314 314 299 285 285 219 204 200 196
## PD32 CAO10 CAO13 CAO37 PH09 CAO15 CAO43 PD16 CAO62 PH40 PH47 PH48 PD21
## 159 152 147 146 130 123 118 88 83 82 79 75 67
## PH04 PH55 CAO27 CAH09 PH57 PD23 CAO19 CAO25 CAH06 CAO61 PD04 CAH10 CAO28
## 65 62 57 54 50 50 49 48 44 37 35 33 33
## PD07 CAO52 PH08 PD34 PD27 P02 P09 PH53 CAO58 PD14 CAO05 PH43 PD25
## 32 32 30 28 26 25 25 23 23 23 22 21 20
## CAO56 CAO04 PD08 CAO33 CAH19 PH41 CAH08 CAO29 PD28 PD13 CAH11 PH39 PH49
## 20 18 18 18 17 16 14 14 13 12 12 12 11
## CAO55 CAO65 PH35 PH05 PH33 CAO60 P04 PH32 CAO20 PH31 CAO08 PH54 CAO30
## 11 11 10 9 9 9 9 9 8 8 8 7 7
## CAH18 PH46 PH07 CAO26 PH38 PD29 PH14 PH27 CAH05 CAO09 PD24 P03 CAO45
## 7 6 6 6 6 5 5 5 4 4 4 4 4
## PH01 PH03 CAO06 PD31 CAO23 PH22 CAH12 CAO35 CAH16 CAO41 CAO07 CAO64 PH52
## 3 3 3 3 3 3 3 3 3 3 2 2 2
## PH06 CAO11 CAO59 PH18 PH20 PH24 CAO31 PH30 PD33 CAH14 CAH15 CAO44 PH58
## 2 2 2 2 2 2 2 2 2 2 2 2 2
## P01 CAO03 PH02 PD26 CAO17 PD30 PD35 PD09 CAO22 CAO57 PD06 PH15 PH16
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## PH19 PH21 PH25 CAO63 CAO32 PH26 CAO46 PH28 PH29 P05 CAO34 CAO66 CAO67
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## PD20 P07 CAO42 P08 PH36 PH37 PH42 CAO68
## 1 1 1 1 1 1 1 1
# Which is the most abundant family?
# Sum the counts for each family (column of the family-aggregated matrix) across all locations (rows)
total_abundance_fam <- colSums(SPE.fam)
most_abundant_family<- sort(total_abundance_fam, decreasing = TRUE)
most_abundant_family
## Cladorhizidae Primnoidae Corallidae Farreidae
## 3434 2347 1332 1188
## Balticinidae Demospongiae Rossellidae Keratoisididae
## 1150 943 837 806
## Schizopathidae Euplectellidae Porifera Plexauridae
## 732 648 391 348
## Aphrocallistidae Acanthogorgiidae Euretidae Octocorallia
## 310 206 206 196
## Anthoptilidae Vulcanellidae Hexactinellida Chrysogorgiidae
## 126 116 63 30
## Chalinidae Mycalidae Pheronematidae Alcyoniidae
## 28 26 24 20
## Cladopathidae Umbellulidae Pennatulidae Hexacorallia
## 10 6 4 2
## Kophobelemnidae
## 2
# Which taxa/families are depth generalists? Which are more specialised? (Occurring in only one depth e.g.)
# Combine family-level species data with depth information
species_depth <- cbind(SPE.fam, Depth = ENV$depth)
# Count how many unique depths each family appears in (abundance > 0)
species_depth_counts <- species_depth %>%
summarise(across(-Depth, ~length(unique(Depth[. > 0])))) %>%
t() %>%
as.data.frame()
colnames(species_depth_counts) <- "Unique_Depths"
species_depth_counts$Species <- rownames(species_depth_counts)
# View family with the most and least depth occurrences
species_depth_counts <- species_depth_counts[order(species_depth_counts$Unique_Depths), ]
# Define threshold (e.g., species in < 10 depths = specialist, otherwise generalist)
# NOTE(review): this Category (by number of unique depths) differs from the
# range-based Category computed below for the plot - confirm which definition is intended
species_depth_counts$Category <- ifelse(species_depth_counts$Unique_Depths < 10, "Specialist", "Generalist")
# Convert SPE to long format for ggplot: one row per (depth, family) with positive abundance
species_long <- SPE.fam %>%
mutate(Depth = ENV$depth) %>%
pivot_longer(cols = -Depth, names_to = "Species", values_to = "Abundance") %>%
filter(Abundance > 0) # Remove absences
# Calculate depth range for each species
species_ranges <- species_long %>%
group_by(Species) %>%
summarise(MinDepth = min(Depth), MaxDepth = max(Depth)) %>%
mutate(DepthRange = MaxDepth - MinDepth,
Category = ifelse(DepthRange > 1000, "Generalist", "Specialist")) # generalist = depth range wider than 1000 m
# Merge category information with long species data
species_long <- species_long %>%
left_join(species_ranges, by = "Species")
# Plot with generalists highlighted (boxplots of occupied depths, ordered by depth)
ggplot(species_long, aes(x = reorder(Species, Depth), y = Depth, color = Category)) +
geom_boxplot() +
coord_flip() +
scale_color_manual(values = c("Generalist" = "red", "Specialist" = "black")) +
labs(y = "Depth (m)", x = "Species",
title = "Species Depth Ranges",
color = "Category") +
theme_minimal()
First step is to go through each dive transect, and see where the individual counts are the highest. For this a simple histogram by depth is needed.
# Per-dive density of observations over time: histogram of record times
# (Start.Time, in seconds) with an overlaid kernel density curve, one plot
# per dive. Peaks indicate candidate coral/sponge garden sections.
mins <- 10                # bin width in minutes (renamed: `min` masked base::min)
seconds <- mins * 60      # Start.Time is in seconds, so binwidth is too
divesites <- unique(data.a$Dive.Name)
plotList <- vector("list", length = length(divesites))  # 15 dive sites
for (i in seq_along(divesites)) {
  # Subset the expanded data (Dataset A) for the current dive
  dive_subset <- subset(data.a, Dive.Name == divesites[i])
  plotList[[i]] <- ggplot(dive_subset, aes(x = Start.Time, y = after_stat(density))) +
    # after_stat(density) replaces the deprecated ..density.. (ggplot2 >= 3.4);
    # alpha must be in [0, 1] — the original 1.5 was silently capped at 1
    geom_histogram(binwidth = seconds, alpha = 1) +
    # linewidth replaces the deprecated size aesthetic for lines
    geom_density(linewidth = 1, color = "blue")
}
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Render all per-dive density histograms (one list element per dive site)
print(plotList)
## [[1]]
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
This was analysed manually, inspecting the plots for high density peaks. The following decisions were taken: Dives with high densities: EX2304: 02 + 07. EX2306: 03 + 05 + 14 + 15 + 18 + Maybe 07? + Maybe 12? Those dives were inspected again, and the timing of the high densities specified. More details can be found in the next table:
The following is a script to add data to the garden_length_df - as given in the supplementary file. Just to explain what has been done.
# Display the forest/garden metadata table (small font + striping so the
# very wide table fits on a PDF page)
kable(garden_length_df) %>%
kable_styling(font_size = 7, latex_options = c("striped"))
| expedition | dive_number | dive_name | potential_garden | time_start | time_end | total.time | frame.grab.every.min | frame.grab.1 | frame.ID.1 | frame.grab.2 | frame.ID.2 | frame.grab.3 | frame.ID.3 | frame.grab.4 | frame.ID.4 | frame.grab.5 | frame.ID.5 | frame.grab.6 | frame.ID.6 | frame.grab.7 | frame.ID.7 | frame.grab.8 | frame.ID.8 | frame.grab.9 | frame.ID.9 | frame.grab.10 | frame.ID.10 | time_loc_1 | lon_loc_1 | lat_loc_1 | depth_loc_1 | time_loc_2 | lon_loc_2 | lat_loc_2 | depth_loc_2 | lasers_on | dominating_fauna | elevation | comment |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| EX2304 | 2 | 230402 | 2304-2A | 7/16/23 20:36 | 7/16/23 22:05 | 89.16667 | 8.92 | 20:36 | 1_203909 | 20:45 | 2_205111 | 20:54 | 3_205459 | 21:03 | 4_205735 | 21:12 | 5_211847 | 21:21 | 6_212126 | 21:30 | 7_213129 | 21:39 | 8_215013 | 9:48:00 PM (no good frame) | 9_213521 | 9:57:00 PM (no good frame) | 10_210645 | 7/16/23 20:36 | -156.5278 | 54.75417 | 2269.5956 | 7/16/23 22:05 | -156.5288 | 54.75486 | 2202.2417 | yes | sponges (PD03, PD02, PD05) | flat | sticks |
| EX2304 | 7 | 230407 | 2304-7A | 7/23/23 17:39 | 7/24/23 0:06 | 386.73333 | 38.67 | 17:39 | 1_174453 | 18:20 | 2_181239 | 19:00 | 3_185428 | 19:40 | 4_193557 | 20:20 | 5_201919 | 21:00 | 6_212438 | 21:40 | 7_215329 | 22:20 | 8_222640 | 23:00 | 9_230000 | 23:40 | 10_233941 | 7/23/23 17:39 | -169.7760 | 53.29347 | 777.2718 | 7/24/23 0:06 | -169.7804 | 53.29140 | 532.4322 | yes | sponge and coral | slight incline | diverse |
| EX2306 | 3 | 230603 | 2306-3A | 8/26/23 19:25 | 8/26/23 20:50 | 84.88333 | 8.49 | 19:30 | 1_193235 | 19:40 | 2_194145 | 19:50 | 3_nolasers_201343 | 20:00 | 4_200713 | 8:10:00 PM (no lasers) | 5_202350 | 20:20 | 6_203608 | 20:30 | 7_205053 | 20:40 | 8_193355 | 20:50 | 9_nolasers_201453 | 21:00 | 10_202527 | 8/26/23 19:25 | -146.3133 | 56.49172 | 838.7715 | 8/26/23 20:50 | -146.3151 | 56.49096 | 787.8098 | check | check video | ||
| EX2306 | 5 | 230605 | 2306-5A | 8/28/23 20:10 | 8/28/23 22:47 | 157.76667 | 15.78 | 20:10 | 1_201328 | 20:25 | 2_nolasers_202631 | 20:40 | 3_nolasers204228 | 8:55:00 PM (no lasers) | 4_213425 | 9:10:00 PM (no lasers) | 5_214745 | 9:25:00 PM (no lasers) | 6_225655 | 21:40 | 7_230032 | 21:55 | 8_201508 | 10:10:00 PM (changed to earlier as no lasers) | 9_232900 | 10:25:00 PM (changed to earlier as no frames) | 10_202353 | 8/28/23 20:10 | -144.3007 | 56.08964 | 579.7123 | 8/28/23 22:47 | -144.3043 | 56.08717 | 433.9817 | no lasers until 21:34 | |||
| EX2306 | 7 | 230607 | 2306-7A | 8/30/23 19:06 | 8/30/23 21:35 | 148.88333 | 14.89 | 19:06 | 1_190612 | 19:20 | 2_192047 | 7:45:00 PM (no lasers) | 3_211342 | 8:00:00 PM (no lasers) | 4_211700 | 8:15:00 PM (no lasers) | 5_213502 | 8:30:00 PM (no lasers) | 6_nolasers_200347 | 8:45:00 PM (no lasers) | 7_nolasers_201827 | 9:00:00 PM (no lasers) | 8_nolasers_203000 | 21:15 | 9_nolasers_205715 | 21:30 | 10_193037 | 8/30/23 19:06 | -140.8327 | 55.01326 | 3276.3970 | 8/30/23 21:35 | -140.8339 | 55.01235 | 3193.2476 | flat | |||
| EX2306 | 12 | 230612 | 2306-12A | 9/5/23 19:30 | 9/5/23 21:29 | 119.95000 | 12.00 | 19:30 | 1_193344 | 19:42 | 2_202339 | 19:54 | 3_203032 | 20:06 | 4_210207 | 20:12 | 5_210651 | 20:24 | 6_211730 | 20:36 | 7_213513 | 20:48 | 8_214117 | 21:00 | 9_nolasers_20000 | 21:12 | 10_223405 | 9/5/23 19:30 | -134.5189 | 55.03189 | 1585.8863 | 9/5/23 21:29 | -134.5176 | 55.03155 | 1532.3117 | flat | sticks | ||
| EX2306 | 15 | 230615 | 2306-15A | 9/10/23 20:00 | 9/10/23 23:15 | 195.00000 | 19.50 | 20:00 | EX2306_Dive15 Middleton Canyon_20230910T200340.000Z-001 | 20:20 | EX2306_Dive15 Middleton Canyon_20230910T202410.000Z-001 | 20:40 | EX2306_Dive15 Middleton Canyon_20230910T203735.000Z-001 | 21:00 | EX2306_Dive15 Middleton Canyon_20230910T205500.000Z-001 | 21:20 | EX2306_Dive15 Middleton Canyon_20230910T212710.000Z-001 | 21:40 | EX2306_Dive15 Middleton Canyon_20230910T213925.000Z-001 | 22:00 | EX2306_Dive15 Middleton Canyon_20230910T215955.000Z-001 | 22:20 | 8_EX2306_Dive15 Middleton Canyon_20230910T221630.000Z-001 | 22:40 | 9_EX2306_Dive15 Middleton Canyon_20230910T223920.000Z-001 | 23:00 | 10_EX2306_Dive15 Middleton Canyon_20230910T230005.000Z-001 | 9/10/23 20:03 | -145.7151 | 59.24847 | 2173.3429 | 9/10/23 23:14 | -145.7120 | 59.24975 | 2009.0146 | ||||
| EX2306 | 18 | 230618 | 2306-18A | 9/12/23 20:16 | 9/12/23 20:35 | 19.11667 | 1.91 | 20:16 | 20:18 | 20:20 | 20:22 | 20:24 | 20:26 | 20:28 | 20:30 | 20:32 | 20:34 | 9/12/23 20:16 | -147.0123 | 59.07392 | 1823.8740 | 9/12/23 20:35 | -147.0119 | 59.07398 | 1818.4246 | no | wall | wall 1. no lasers | |||||||||||
| EX2306 | 18 | 230618 | 2306-18B | 9/12/23 22:00 | 9/12/23 22:50 | 50.13333 | 5.01 | 22:00 | 1_220228 | 22:05 | 2_220940 | 22:10 | 3_221347 | 22:15 | 4_221905 | 22:20 | 5_222621 | 22:25 | 6_223233 | 22:30 | 7_224327 | 22:35 | 8_224429 | 22:40 | 9_224628 | 22:45 | 10_223200 | 9/12/23 22:00 | -147.0116 | 59.07334 | 1763.9329 | 9/12/23 22:50 | -147.0113 | 59.07323 | 1746.3971 | wall | wall 2 |
# Horizontal great-circle distance (m) between two positions via the
# Haversine formula; returns NA if any coordinate is missing.
# lat1/lon1: start position (decimal degrees); lat2/lon2: end position.
calculate_distance <- function(lat1, lon1, lat2, lon2) {
  # Scalar guard — this function is applied row-wise via mapply()
  if (any(is.na(c(lat1, lon1, lat2, lon2)))) {
    return(NA)
  }
  # geosphere expects c(longitude, latitude) ordering
  distHaversine(c(lon1, lat1), c(lon2, lat2))
}
# Calculate the horizontal distance between the coordinates
# (row-wise over the start/end positions of each potential garden)
garden_length_df$horizontal_distance_m <- mapply(calculate_distance, garden_length_df$lat_loc_1, garden_length_df$lon_loc_1, garden_length_df$lat_loc_2, garden_length_df$lon_loc_2)
# Calculate the vertical distance (depth difference) in meters
# Vertical distance (m) between two depths; returns NA if either is missing.
# Note: abs(depth2 - depth1) would propagate NA on its own, but the explicit
# guard keeps the contract obvious and mirrors calculate_distance().
calculate_vertical_distance <- function(depth1, depth2) {
  if (is.na(depth1) || is.na(depth2)) {
    return(NA)
  }
  abs(depth2 - depth1)
}
# Vertical (depth) distance in meters, row-wise per garden
garden_length_df$vertical_distance_m <- mapply(calculate_vertical_distance, garden_length_df$depth_loc_1, garden_length_df$depth_loc_2)
# 3D distance (m) between two positions: horizontal Haversine distance
# combined with the depth difference via the Pythagorean theorem.
# Returns NA if any of the six inputs is missing.
calculate_3D_distance <- function(lat1, lon1, depth1, lat2, lon2, depth2) {
  # Scalar guard — applied row-wise via mapply()
  if (any(is.na(c(lat1, lon1, depth1, lat2, lon2, depth2)))) {
    return(NA)
  }
  # Horizontal great-circle distance (geosphere wants c(lon, lat))
  horizontal_distance <- distHaversine(c(lon1, lat1), c(lon2, lat2))
  # Vertical component: absolute depth difference in meters
  vertical_distance <- abs(depth2 - depth1)
  # Hypotenuse = straight-line 3D distance along the transect
  sqrt(horizontal_distance^2 + vertical_distance^2)
}
# Calculate the 3D distance between the coordinates - total distance
# (row-wise start/end position + depth of each garden)
garden_length_df$distance_3D_m <- mapply(calculate_3D_distance, garden_length_df$lat_loc_1, garden_length_df$lon_loc_1, garden_length_df$depth_loc_1, garden_length_df$lat_loc_2, garden_length_df$lon_loc_2, garden_length_df$depth_loc_2)
# Convert Start.Date to POSIXct (UTC) and derive a canonical time column.
data.a$Start.Date <- ymd_hms(data.a$Start.Date, tz = "UTC")
# Single conversion step: format to a canonical string, then parse back as
# POSIXct in UTC. (The original did this in two redundant steps — the first
# character assignment was immediately overwritten.)
data.a$time <- as.POSIXct(format(data.a$Start.Date, "%Y-%m-%d %H:%M:%S UTC"), tz = "UTC")
# Parse the garden start/end times (entered as m/d/y h:m) and store them as
# formatted UTC strings, matching the representation used in data.a
garden_length_df <- garden_length_df %>%
  mutate(time_start = mdy_hm(time_start), # Parse the date-time
         time_start = format(with_tz(time_start, "UTC"), "%Y-%m-%d %H:%M:%S UTC"), # Convert to UTC and format
         time_end = mdy_hm(time_end),
         time_end = format(with_tz(time_end, "UTC"), "%Y-%m-%d %H:%M:%S UTC"))
# Also order the rows by time, as right now it's not ordered because of the
# expansion of the df (counts expanded into rows)
data.a <- data.a %>% arrange(time)
# Number of observations in each garden: count rows of data.a whose
# timestamp falls inside the garden's [time_start, time_end] window.
# vapply over seq_len(nrow(...)) replaces sapply(1:nrow(...)): it is safe
# for zero rows and guarantees an integer result.
garden_length_df <- garden_length_df %>%
  mutate(n = vapply(seq_len(nrow(garden_length_df)), function(i) {
    # Start and end of the current garden's transect window, in UTC
    start_time <- ymd_hms(garden_length_df$time_start[i], tz = "UTC")
    end_time <- ymd_hms(garden_length_df$time_end[i], tz = "UTC")
    # Inclusive window; NA timestamps are not counted (same as filter())
    sum(data.a$time >= start_time & data.a$time <= end_time, na.rm = TRUE)
  }, integer(1)))
# Observation rates per garden: individuals per minute of transect time
# (ind_min) and individuals per meter of 3D distance traveled (ind_m_traveled).
garden_length_df <- garden_length_df %>%
  mutate(ind_min = n / as.numeric(total.time),
         ind_m_traveled = n / distance_3D_m)
# Show the enriched garden table (tiny font — the table is very wide)
kable(garden_length_df) %>%
  kable_styling(font_size = 5, latex_options = c("striped"))
| expedition | dive_number | dive_name | potential_garden | time_start | time_end | total.time | frame.grab.every.min | frame.grab.1 | frame.ID.1 | frame.grab.2 | frame.ID.2 | frame.grab.3 | frame.ID.3 | frame.grab.4 | frame.ID.4 | frame.grab.5 | frame.ID.5 | frame.grab.6 | frame.ID.6 | frame.grab.7 | frame.ID.7 | frame.grab.8 | frame.ID.8 | frame.grab.9 | frame.ID.9 | frame.grab.10 | frame.ID.10 | time_loc_1 | lon_loc_1 | lat_loc_1 | depth_loc_1 | time_loc_2 | lon_loc_2 | lat_loc_2 | depth_loc_2 | lasers_on | dominating_fauna | elevation | comment | horizontal_distance_m | vertical_distance_m | distance_3D_m | n | ind_min | ind_m_traveled |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| EX2304 | 2 | 230402 | 2304-2A | 2023-07-16 20:36:00 UTC | 2023-07-16 22:05:00 UTC | 89.16667 | 8.92 | 20:36 | 1_203909 | 20:45 | 2_205111 | 20:54 | 3_205459 | 21:03 | 4_205735 | 21:12 | 5_211847 | 21:21 | 6_212126 | 21:30 | 7_213129 | 21:39 | 8_215013 | 9:48:00 PM (no good frame) | 9_213521 | 9:57:00 PM (no good frame) | 10_210645 | 7/16/23 20:36 | -156.5278 | 54.75417 | 2269.5956 | 7/16/23 22:05 | -156.5288 | 54.75486 | 2202.2417 | yes | sponges (PD03, PD02, PD05) | flat | sticks | 98.27062 | 67.3539 | 119.13716 | 954 | 10.699065 | 8.007578 |
| EX2304 | 7 | 230407 | 2304-7A | 2023-07-23 17:39:00 UTC | 2023-07-24 00:06:00 UTC | 386.73333 | 38.67 | 17:39 | 1_174453 | 18:20 | 2_181239 | 19:00 | 3_185428 | 19:40 | 4_193557 | 20:20 | 5_201919 | 21:00 | 6_212438 | 21:40 | 7_215329 | 22:20 | 8_222640 | 23:00 | 9_230000 | 23:40 | 10_233941 | 7/23/23 17:39 | -169.7760 | 53.29347 | 777.2718 | 7/24/23 0:06 | -169.7804 | 53.29140 | 532.4322 | yes | sponge and coral | slight incline | diverse | 371.90054 | 244.8396 | 445.25997 | 4619 | 11.943630 | 10.373715 |
| EX2306 | 3 | 230603 | 2306-3A | 2023-08-26 19:25:00 UTC | 2023-08-26 20:50:00 UTC | 84.88333 | 8.49 | 19:30 | 1_193235 | 19:40 | 2_194145 | 19:50 | 3_nolasers_201343 | 20:00 | 4_200713 | 8:10:00 PM (no lasers) | 5_202350 | 20:20 | 6_203608 | 20:30 | 7_205053 | 20:40 | 8_193355 | 20:50 | 9_nolasers_201453 | 21:00 | 10_202527 | 8/26/23 19:25 | -146.3133 | 56.49172 | 838.7715 | 8/26/23 20:50 | -146.3151 | 56.49096 | 787.8098 | check | check video | 139.56314 | 50.9617 | 148.57646 | 565 | 6.656195 | 3.802756 | ||
| EX2306 | 5 | 230605 | 2306-5A | 2023-08-28 20:10:00 UTC | 2023-08-28 22:47:00 UTC | 157.76667 | 15.78 | 20:10 | 1_201328 | 20:25 | 2_nolasers_202631 | 20:40 | 3_nolasers204228 | 8:55:00 PM (no lasers) | 4_213425 | 9:10:00 PM (no lasers) | 5_214745 | 9:25:00 PM (no lasers) | 6_225655 | 21:40 | 7_230032 | 21:55 | 8_201508 | 10:10:00 PM (changed to earlier as no lasers) | 9_232900 | 10:25:00 PM (changed to earlier as no frames) | 10_202353 | 8/28/23 20:10 | -144.3007 | 56.08964 | 579.7123 | 8/28/23 22:47 | -144.3043 | 56.08717 | 433.9817 | no lasers until 21:34 | 351.88581 | 145.7306 | 380.86878 | 808 | 5.121487 | 2.121465 | |||
| EX2306 | 7 | 230607 | 2306-7A | 2023-08-30 19:06:00 UTC | 2023-08-30 21:35:00 UTC | 148.88333 | 14.89 | 19:06 | 1_190612 | 19:20 | 2_192047 | 7:45:00 PM (no lasers) | 3_211342 | 8:00:00 PM (no lasers) | 4_211700 | 8:15:00 PM (no lasers) | 5_213502 | 8:30:00 PM (no lasers) | 6_nolasers_200347 | 8:45:00 PM (no lasers) | 7_nolasers_201827 | 9:00:00 PM (no lasers) | 8_nolasers_203000 | 21:15 | 9_nolasers_205715 | 21:30 | 10_193037 | 8/30/23 19:06 | -140.8327 | 55.01326 | 3276.3970 | 8/30/23 21:35 | -140.8339 | 55.01235 | 3193.2476 | flat | 127.90890 | 83.1494 | 152.55985 | 557 | 3.741184 | 3.651026 | |||
| EX2306 | 12 | 230612 | 2306-12A | 2023-09-05 19:30:00 UTC | 2023-09-05 21:29:00 UTC | 119.95000 | 12.00 | 19:30 | 1_193344 | 19:42 | 2_202339 | 19:54 | 3_203032 | 20:06 | 4_210207 | 20:12 | 5_210651 | 20:24 | 6_211730 | 20:36 | 7_213513 | 20:48 | 8_214117 | 21:00 | 9_nolasers_20000 | 21:12 | 10_223405 | 9/5/23 19:30 | -134.5189 | 55.03189 | 1585.8863 | 9/5/23 21:29 | -134.5176 | 55.03155 | 1532.3117 | flat | sticks | 88.68787 | 53.5746 | 103.61359 | 486 | 4.051688 | 4.690505 | ||
| EX2306 | 15 | 230615 | 2306-15A | 2023-09-10 20:00:00 UTC | 2023-09-10 23:15:00 UTC | 195.00000 | 19.50 | 20:00 | EX2306_Dive15 Middleton Canyon_20230910T200340.000Z-001 | 20:20 | EX2306_Dive15 Middleton Canyon_20230910T202410.000Z-001 | 20:40 | EX2306_Dive15 Middleton Canyon_20230910T203735.000Z-001 | 21:00 | EX2306_Dive15 Middleton Canyon_20230910T205500.000Z-001 | 21:20 | EX2306_Dive15 Middleton Canyon_20230910T212710.000Z-001 | 21:40 | EX2306_Dive15 Middleton Canyon_20230910T213925.000Z-001 | 22:00 | EX2306_Dive15 Middleton Canyon_20230910T215955.000Z-001 | 22:20 | 8_EX2306_Dive15 Middleton Canyon_20230910T221630.000Z-001 | 22:40 | 9_EX2306_Dive15 Middleton Canyon_20230910T223920.000Z-001 | 23:00 | 10_EX2306_Dive15 Middleton Canyon_20230910T230005.000Z-001 | 9/10/23 20:03 | -145.7151 | 59.24847 | 2173.3429 | 9/10/23 23:14 | -145.7120 | 59.24975 | 2009.0146 | 228.13187 | 164.3283 | 281.15465 | 728 | 3.733333 | 2.589322 | ||||
| EX2306 | 18 | 230618 | 2306-18A | 2023-09-12 20:16:00 UTC | 2023-09-12 20:35:00 UTC | 19.11667 | 1.91 | 20:16 | 20:18 | 20:20 | 20:22 | 20:24 | 20:26 | 20:28 | 20:30 | 20:32 | 20:34 | 9/12/23 20:16 | -147.0123 | 59.07392 | 1823.8740 | 9/12/23 20:35 | -147.0119 | 59.07398 | 1818.4246 | no | wall | wall 1. no lasers | 24.32872 | 5.4494 | 24.93156 | 222 | 11.612903 | 8.904378 | |||||||||||
| EX2306 | 18 | 230618 | 2306-18B | 2023-09-12 22:00:00 UTC | 2023-09-12 22:50:00 UTC | 50.13333 | 5.01 | 22:00 | 1_220228 | 22:05 | 2_220940 | 22:10 | 3_221347 | 22:15 | 4_221905 | 22:20 | 5_222621 | 22:25 | 6_223233 | 22:30 | 7_224327 | 22:35 | 8_224429 | 22:40 | 9_224628 | 22:45 | 10_223200 | 9/12/23 22:00 | -147.0116 | 59.07334 | 1763.9329 | 9/12/23 22:50 | -147.0113 | 59.07323 | 1746.3971 | wall | wall 2 | 17.67383 | 17.5358 | 24.89716 | 370 | 7.380319 | 14.861131 |
#To save this file use:
#library(openxlsx)
#write.xlsx(garden_length_df, "output.xlsx")
Now we have detailed information for each forest. Next, the density data will be analysed.
I put this in the same df as the length data, but as sheets. Now I saved each individual sheet as a file, to be able to work with it. All files are available from the Supplementary File (just save each sheet separately).
# Read all garden files
# You may need to adapt the path
# Each xlsx holds per-frame counts for one potential garden (one sheet of
# the supplementary workbook saved as its own file).
file_paths <- c(
"./garden densities/D02_2304_G02A.xlsx", "./garden densities/D03_2306_G03A.xlsx",
"./garden densities/D05_2306_G05A.xlsx", "./garden densities/D07_2304_G07A.xlsx",
"./garden densities/D07_2306_G07A.xlsx", "./garden densities/D12_2306_G12A.xlsx",
"./garden densities/D15_2306_G15A.xlsx", "./garden densities/D18_2306_G18B.xlsx"
)
# NOTE(review): the 5th id ("2306-G07A") does not follow the "Dxx_yyyy_Gzz"
# pattern of the others, so downstream tables show an inconsistent label for
# EX2306 dive 07 — consider renaming it to "D07_2306_G07A".
garden_ids <- c("D02_2304_G02A", "D03_2306_G03A", "D05_2306_G05A", "D07_2304_G07A", "2306-G07A", "D12_2306_G12A", "D15_2306_G15A", "D18_2306_G18B")
# Read data and assign garden_ids
# Each sheet becomes a data frame tagged with its garden_id in a new column
list_of_dfs <- lapply(seq_along(file_paths), function(i) {
df <- read_excel(file_paths[i])
df$garden_id <- garden_ids[i]
return(df)
})
# Function to calculate average density per morphotype
# Input: one garden's sheet with per-frame counts in columns whose names
# start with "P" or "C" (presumably sponge/coral morphotype codes — confirm
# against the taxa sheet) plus an area_squared column (frame area, m^2) and
# the garden_id tag added at read time.
# Output: long tibble with one row per (garden_id, Morphotype) holding the
# mean density (count / area_squared) across frames, NA frames removed.
summarize_density_df <- function(df) {
df %>%
group_by(garden_id) %>%
summarise(across(starts_with("P") | starts_with("C"), ~ mean(.x / area_squared, na.rm = TRUE))) %>%
pivot_longer(cols = -garden_id, names_to = "Morphotype", values_to = "density")
}
# Summarise every garden sheet, stack the results into one long data frame
# (keeping the list position as df_id), and attach taxonomic metadata.
gardens_df <- lapply(list_of_dfs, summarize_density_df) %>%
  bind_rows(.id = "df_id") %>%
  left_join(taxa, by = "Morphotype")
# Stacked barplot of total density per garden, coloured by a grouping column.
# data: long density table (gardens_df); group_col: grouping column name as a
# string (e.g. "Phylum"); fill_col: kept for interface compatibility but the
# fill is driven by group_col; title: plot title.
# Fixes vs. original: aes(weight = prop_density) was ignored by
# geom_bar(stat = "identity") (ggplot2 warned "Ignoring unknown aesthetics:
# weight"), so it is dropped — bar heights were and remain total_density.
# .groups = "drop_last" makes the default summarise() grouping explicit and
# silences the repeated message.
plot_density <- function(data, group_col, fill_col, title) {
  data %>%
    group_by(garden_id, !!sym(group_col)) %>%
    summarise(total_density = sum(density), .groups = "drop_last") %>%
    # proportion of each group within its garden (kept for reference)
    mutate(prop_density = total_density / sum(total_density)) %>%
    ggplot(aes(x = garden_id, y = total_density, fill = !!sym(group_col))) +
    geom_bar(stat = "identity", position = "stack") +
    labs(x = "Garden ID", y = "Individuals per m^2", title = title) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}
# Plot densities by Phylum, Morphotype, and Family
# Phylum-level stacked densities per garden
plot_density(gardens_df, "Phylum", "Phylum", "Densities by Phylum")
## `summarise()` has grouped output by 'garden_id'. You can override using the
## `.groups` argument.
## Warning in geom_bar(stat = "identity", aes(weight = prop_density), position =
## "stack"): Ignoring unknown aesthetics: weight
# Morphotype-level stacked densities per garden
plot_density(gardens_df, "Morphotype", "Morphotype", "Densities by Morphotype")
## `summarise()` has grouped output by 'garden_id'. You can override using the
## `.groups` argument.
## Warning in geom_bar(stat = "identity", aes(weight = prop_density), position =
## "stack"): Ignoring unknown aesthetics: weight
# Family-level stacked densities per garden
plot_density(gardens_df, "Family", "Family", "Densities by Family")
## `summarise()` has grouped output by 'garden_id'. You can override using the
## `.groups` argument.
## Warning in geom_bar(stat = "identity", aes(weight = prop_density), position =
## "stack"): Ignoring unknown aesthetics: weight
# Calculate sponge-dominated gardens
# A garden is "sponge-dominated" when Porifera contribute >= 70% of the
# summed mean density across all morphotypes in that garden.
sponge_dominated <- gardens_df %>%
group_by(garden_id, Phylum) %>%
# total mean density per phylum within each garden
summarise(total_density = sum(density, na.rm = TRUE)) %>%
group_by(garden_id) %>%
# proportion of each phylum within the garden's overall density
mutate(proportion = total_density / sum(total_density, na.rm = TRUE)) %>%
filter(Phylum == "Porifera") %>%
mutate(is_sponge_dominated = proportion >= 0.7)
## `summarise()` has grouped output by 'garden_id'. You can override using the
## `.groups` argument.
# Output results
# One row per garden: Porifera density, its proportion, and the dominance flag
kable(sponge_dominated) %>% kable_styling(font_size = 7, latex_options = c("striped"))
| garden_id | Phylum | total_density | proportion | is_sponge_dominated |
|---|---|---|---|---|
| 2306-G07A | Porifera | 2.5407633 | 0.8775114 | TRUE |
| D02_2304_G02A | Porifera | 20.4033815 | 0.9867127 | TRUE |
| D03_2306_G03A | Porifera | 0.6437813 | 0.2416753 | FALSE |
| D05_2306_G05A | Porifera | 0.1644149 | 0.1264180 | FALSE |
| D07_2304_G07A | Porifera | 1.5121467 | 0.1809220 | FALSE |
| D12_2306_G12A | Porifera | 5.9436183 | 0.9657297 | TRUE |
| D15_2306_G15A | Porifera | 0.1561155 | 0.0665035 | FALSE |
| D18_2306_G18B | Porifera | 1.5929045 | 0.6335733 | FALSE |
# Calculate coral-dominated gardens
# Same 70% threshold as for sponges, but testing the Cnidaria proportion.
coral_dominated <- gardens_df %>%
group_by(garden_id, Phylum) %>%
# total mean density per phylum within each garden
summarise(total_density = sum(density, na.rm = TRUE)) %>%
group_by(garden_id) %>%
# proportion of each phylum within the garden's overall density
mutate(proportion = total_density / sum(total_density, na.rm = TRUE)) %>%
filter(Phylum == "Cnidaria") %>%
mutate(is_coral_dominated = proportion >= 0.7)
## `summarise()` has grouped output by 'garden_id'. You can override using the
## `.groups` argument.
# Output results
# One row per garden: Cnidaria density, its proportion, and the dominance flag
kable(coral_dominated) %>% kable_styling(font_size = 7, latex_options = c("striped"))
| garden_id | Phylum | total_density | proportion | is_coral_dominated |
|---|---|---|---|---|
| 2306-G07A | Cnidaria | 0.3546558 | 0.1224886 | FALSE |
| D02_2304_G02A | Cnidaria | 0.2747567 | 0.0132873 | FALSE |
| D03_2306_G03A | Cnidaria | 2.0200465 | 0.7583247 | TRUE |
| D05_2306_G05A | Cnidaria | 1.1361501 | 0.8735820 | TRUE |
| D07_2304_G07A | Cnidaria | 6.8458576 | 0.8190780 | TRUE |
| D12_2306_G12A | Cnidaria | 0.2109180 | 0.0342703 | FALSE |
| D15_2306_G15A | Cnidaria | 2.1913624 | 0.9334965 | TRUE |
| D18_2306_G18B | Cnidaria | 0.9212551 | 0.3664267 | FALSE |
When comparing both data frames, one can see that only Dive 18 Gumby Ridge is FALSE for both, thus it is a true mixed forest.
This code snippet is useful for looking at exact transect lengths - can be used to update the above values if needed. CHECK IF THIS CAN BE USED INSTEAD ABOVE??
###Finding transect length of each dive using a smooth line through time-sorted coordinates
# Function to calculate the length of a smooth curve using SPLINE
# Fits independent cubic smoothing splines (spar = 1, heavy smoothing) to
# latitude and longitude as functions of time, resamples the smoothed track
# at 100 evenly spaced times, and sums the Haversine distances between
# consecutive resampled points. Returns the transect length in meters.
calculate_curve_length <- function(df) {
  df <- df %>% arrange(time)
  df$time_numeric <- as.numeric(df$time)
  smooth_lat <- smooth.spline(df$time_numeric, df$Latitude..deg., spar = 1)
  smooth_lon <- smooth.spline(df$time_numeric, df$Longitude..deg., spar = 1)
  time_seq <- seq(min(df$time_numeric), max(df$time_numeric), length.out = 100)
  # Matrix of c(lon, lat) rows — the ordering geosphere expects
  smooth_coords <- cbind(lon = predict(smooth_lon, time_seq)$y,
                         lat = predict(smooth_lat, time_seq)$y)
  # distHaversine is vectorised over matrices, so the consecutive-segment
  # distances come from a single call. (The original mapply over a locally
  # re-defined scalar helper was redundant and shadowed the global
  # calculate_distance().)
  n_pts <- nrow(smooth_coords)
  sum(distHaversine(smooth_coords[-n_pts, , drop = FALSE],
                    smooth_coords[-1, , drop = FALSE]))
}
# Apply the function to each dive
# pick(everything()) replaces cur_data(), which was deprecated in dplyr 1.1.0
# (it passes the non-grouping columns of the current group, as before).
lengths <- data.a %>%
  group_by(Dive.Name) %>%
  summarise(Length = calculate_curve_length(pick(everything())))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `Length = calculate_curve_length(cur_data())`.
## ℹ In group 1: `Dive.Name = "EX2304_DIVE02 Big Bend Canyon"`.
## Caused by warning:
## ! `cur_data()` was deprecated in dplyr 1.1.0.
## ℹ Please use `pick()` instead.
print(lengths) # looks good; I double-checked that no value looks out of range or fails to correspond to the transect in SeaTube (comparing dives)
## # A tibble: 15 × 2
## Dive.Name Length
## <chr> <dbl>
## 1 EX2304_DIVE02 Big Bend Canyon 436.
## 2 EX2304_DIVE05 Lone Knoll Scarp 301.
## 3 EX2304_DIVE07 Uliaga Mound Take 2 381.
## 4 EX2304_DIVE08 Umnak Canyon 176.
## 5 EX2306_Dive01 Kodiak Shelf 328.
## 6 EX2306_Dive03 Giacomini Seamount 597.
## 7 EX2306_Dive04 Quinn Seamount 431.
## 8 EX2306_Dive05 Surveyor Seamount 728.
## 9 EX2306_Dive06 Durgin Guyot 403.
## 10 EX2306_Dive07 Deep Discover Dome 279.
## 11 EX2306_Dive08 Denson Seamount 531.
## 12 EX2306_Dive12 Noyes Canyon 305.
## 13 EX2306_Dive14 Chatham Seep 593.
## 14 EX2306_Dive15 Middleton Canyon 284.
## 15 EX2306_Dive18 Gumby Ridge 242.
#Calculate total distance of all dives
# Grand total transect length across all 15 dives, in meters
sum(lengths$Length)
## [1] 6014.612
# Map the verbose SeaTube dive names onto the short "cruise_dive" codes used
# in plots (see the reference list at the top of the document). Any name not
# found in the lookup table is kept unchanged, matching the case_when default.
dive_code_lookup <- c(
  "EX2304_DIVE02 Big Bend Canyon"     = "2304_02",
  "EX2304_DIVE05 Lone Knoll Scarp"    = "2304_05",
  "EX2304_DIVE07 Uliaga Mound Take 2" = "2304_07",
  "EX2304_DIVE08 Umnak Canyon"        = "2304_08",
  "EX2306_Dive01 Kodiak Shelf"        = "2306_01",
  "EX2306_Dive03 Giacomini Seamount"  = "2306_03",
  "EX2306_Dive04 Quinn Seamount"      = "2306_04",
  "EX2306_Dive05 Surveyor Seamount"   = "2306_05",
  "EX2306_Dive06 Durgin Guyot"        = "2306_06",
  "EX2306_Dive07 Deep Discover Dome"  = "2306_07",
  "EX2306_Dive08 Denson Seamount"     = "2306_08",
  "EX2306_Dive12 Noyes Canyon"        = "2306_12",
  "EX2306_Dive14 Chatham Seep"        = "2306_14",
  "EX2306_Dive15 Middleton Canyon"    = "2306_15",
  "EX2306_Dive18 Gumby Ridge"         = "2306_18"
)
transect_lengths <- lengths %>%
  mutate(Dive.Name = coalesce(unname(dive_code_lookup[Dive.Name]), Dive.Name))
# To verify this function, I wanted to also plot the transects with coordinates and smooth-fitted line
# Function to create a plot for a specific dive - mostly to check if the transect data is ok
# Shows raw coordinate points (blue) and the spar = 1 spline-smoothed track
# (red), mirroring the smoothing used in calculate_curve_length().
plot_transect_for_dive <- function(dive_id) {
  df <- data.a %>% filter(Dive.Name == dive_id) %>% arrange(time)
  df$time_numeric <- as.numeric(df$time)
  smooth_lat <- smooth.spline(df$time_numeric, df$Latitude..deg., spar = 1)
  smooth_lon <- smooth.spline(df$time_numeric, df$Longitude..deg., spar = 1)
  time_seq <- seq(min(df$time_numeric), max(df$time_numeric), length.out = 100)
  smooth_lat_values <- predict(smooth_lat, time_seq)$y
  smooth_lon_values <- predict(smooth_lon, time_seq)$y
  smooth_coords <- data.frame(lat = smooth_lat_values, lon = smooth_lon_values)
  ggplot() +
    geom_point(data = df, aes(x = Longitude..deg., y = Latitude..deg.), color = "blue", size = 2) +
    # linewidth replaces the deprecated size aesthetic for lines (ggplot2 >= 3.4)
    geom_line(data = smooth_coords, aes(x = lon, y = lat), color = "red", linewidth = 1) +
    labs(title = paste("Transect Line for Dive:", dive_id), x = "Longitude", y = "Latitude") +
    theme_minimal()
}
# Plots per dive
unique(data.a$Dive.Name) #15 dives (the original comment said 14, but the printed list below shows 15)
## [1] "EX2304_DIVE02 Big Bend Canyon" "EX2304_DIVE05 Lone Knoll Scarp"
## [3] "EX2304_DIVE07 Uliaga Mound Take 2" "EX2304_DIVE08 Umnak Canyon"
## [5] "EX2306_Dive01 Kodiak Shelf" "EX2306_Dive03 Giacomini Seamount"
## [7] "EX2306_Dive04 Quinn Seamount" "EX2306_Dive05 Surveyor Seamount"
## [9] "EX2306_Dive06 Durgin Guyot" "EX2306_Dive07 Deep Discover Dome"
## [11] "EX2306_Dive08 Denson Seamount" "EX2306_Dive12 Noyes Canyon"
## [13] "EX2306_Dive14 Chatham Seep" "EX2306_Dive15 Middleton Canyon"
## [15] "EX2306_Dive18 Gumby Ridge"
# Visual check per dive: raw coordinates (blue points) vs the spline-smoothed
# line (red) used for the length estimate. Inline notes record the manual QC.
plot_transect_for_dive("EX2306_Dive01 Kodiak Shelf") #needs range of 1
plot_transect_for_dive("EX2306_Dive04 Quinn Seamount")
plot_transect_for_dive("EX2306_Dive06 Durgin Guyot")
plot_transect_for_dive("EX2306_Dive08 Denson Seamount")
plot_transect_for_dive("EX2304_DIVE08 Umnak Canyon")
plot_transect_for_dive("EX2306_Dive14 Chatham Seep") #Nope
plot_transect_for_dive("EX2306_Dive18 Gumby Ridge")
plot_transect_for_dive("EX2304_DIVE05 Lone Knoll Scarp") #needs range 1. There is one data point very far away, ignored in the length!
plot_transect_for_dive("EX2306_Dive03 Giacomini Seamount")
plot_transect_for_dive("EX2306_Dive05 Surveyor Seamount")
plot_transect_for_dive("EX2306_Dive07 Deep Discover Dome")
plot_transect_for_dive("EX2306_Dive15 Middleton Canyon")
plot_transect_for_dive("EX2304_DIVE02 Big Bend Canyon")
plot_transect_for_dive("EX2304_DIVE07 Uliaga Mound Take 2")
#Works well except for Dive 14 2306
#Problem: for 14 it jumps but the reason is not clear to me,
# I checked with the below code to see if I can get lengths by doing fragments. The length is similar to the one in the above table (from the smooth fitting), so just take this as a good estimate
#Dive 14
# First, isolate the dive
data_d14 <- data.a %>% filter(Dive.Name == "EX2306_Dive14 Chatham Seep")
# Plot the raw path
# Points coloured by timestamp to make the jump in the track visible
ggplot(data_d14, aes(x = Longitude..deg., y = Latitude..deg., color = time)) +
geom_point(size = 2) + # Plot points
geom_line(aes(group = 1), color = "blue") + # Plot path as a line
labs(title = "Transect Path for Dive 14",
x = "Longitude",
y = "Latitude") +
theme_minimal() +
scale_color_datetime() # Color points by time if needed
# Define time intervals
# Split the dive at 21:00 into two fragments around the jump in the track.
# ymd_hms without a tz argument parses in UTC (lubridate default) — matches
# the UTC timestamps in data.a.
start_time_1 <- ymd_hms("2023-09-07 21:00:00")
end_time_1 <- ymd_hms("2023-09-07 23:59:59")
start_time_2 <- ymd_hms("2023-09-07 18:00:00")
end_time_2 <- ymd_hms("2023-09-07 20:59:59")
# Filter data into two fragments
fragment_1 <- data_d14 %>% filter(time >= start_time_1 & time <= end_time_1)
fragment_2 <- data_d14 %>% filter(time >= start_time_2 & time <= end_time_2)
# Plot the fragments
# fragment_1 (later, blue) and fragment_2 (earlier, red) shown together
ggplot() +
geom_point(data = fragment_1, aes(x = Longitude..deg., y = Latitude..deg.), color = "blue", size = 2) +
geom_line(data = fragment_1, aes(x = Longitude..deg., y = Latitude..deg.), color = "blue") +
geom_point(data = fragment_2, aes(x = Longitude..deg., y = Latitude..deg.), color = "red", size = 2) +
geom_line(data = fragment_2, aes(x = Longitude..deg., y = Latitude..deg.), color = "red") +
labs(title = "Transect Path for Dive 14 with Time Fragments",
x = "Longitude",
y = "Latitude") +
theme_minimal()
#Ok looks better, now smooth the line through the two fragments and combine to one length
# Apply smoothing
# Fits cubic smoothing splines to latitude and longitude over numeric time
# and appends the fitted values as lat_smooth / lon_smooth columns.
# spar controls smoothness (larger = smoother); default 0.5.
apply_spline_smoothing <- function(df, spar = 0.5) {
  t_num <- df$time_numeric
  fit_lat <- smooth.spline(t_num, df$Latitude..deg., spar = spar)
  fit_lon <- smooth.spline(t_num, df$Longitude..deg., spar = spar)
  df$lat_smooth <- predict(fit_lat, t_num)$y
  df$lon_smooth <- predict(fit_lon, t_num)$y
  df
}
# Prepare data for smoothing
fragment_1$time_numeric <- as.numeric(fragment_1$time)
# Apply spline smoothing
# NOTE(review): spar = 1.3 is above the usual (0, 1] range — very heavy
# smoothing, chosen to flatten the jump in this fragment.
fragment_1_smooth <- apply_spline_smoothing(fragment_1, spar = 1.3)
# Plot Fragment 1 with spline smoothing
# Raw path in blue, smoothed path in red
ggplot() +
geom_point(data = fragment_1, aes(x = Longitude..deg., y = Latitude..deg.), color = "blue", size = 2) +
geom_line(data = fragment_1, aes(x = Longitude..deg., y = Latitude..deg.), color = "blue") +
geom_line(data = fragment_1_smooth, aes(x = lon_smooth, y = lat_smooth), color = "red") +
labs(title = "Fragment 1: Transect Path with Spline Smoothing",
x = "Longitude",
y = "Latitude") +
theme_minimal()
# Prepare data for smoothing
fragment_2$time_numeric <- as.numeric(fragment_2$time)
# Default smoothing is sufficient for this fragment
fragment_2_smooth <- apply_spline_smoothing(fragment_2, spar = 0.5)
# Plot Fragment 2 with spline smoothing
# Raw path in red, smoothed path in dark red
ggplot() +
geom_point(data = fragment_2, aes(x = Longitude..deg., y = Latitude..deg.), color = "red", size = 2) +
geom_line(data = fragment_2, aes(x = Longitude..deg., y = Latitude..deg.), color = "red") +
geom_line(data = fragment_2_smooth, aes(x = lon_smooth, y = lat_smooth), color = "darkred") +
labs(title = "Fragment 2: Transect Path with Spline Smoothing",
x = "Longitude",
y = "Latitude") +
theme_minimal()
# Length (m) of a smoothed path: sum of Haversine distances between
# consecutive smoothed points, taken in chronological order.
# df must carry time_numeric, lon_smooth and lat_smooth columns.
calculate_path_length <- function(df) {
  # Order points chronologically before measuring the path
  ordered <- df[order(df$time_numeric), ]
  # distHaversine on a single matrix of c(lon, lat) rows returns the
  # distance between each consecutive pair of points
  segment_lengths <- distHaversine(as.matrix(ordered[, c("lon_smooth", "lat_smooth")]))
  sum(segment_lengths, na.rm = TRUE)
}
# Convert 'time' column to numeric for the function
# NOTE(review): these two assignments are redundant — calculate_path_length()
# is applied to fragment_1_smooth / fragment_2_smooth, which already carry
# time_numeric from the smoothing step.
fragment_1$time_numeric <- as.numeric(fragment_1$time)
fragment_2$time_numeric <- as.numeric(fragment_2$time)
# Calculate path length for Fragment 1
length_fragment_1 <- calculate_path_length(fragment_1_smooth)
print(paste("Length of Fragment 1:", length_fragment_1, "meters"))
## [1] "Length of Fragment 1: 172.394302710488 meters"
# Calculate path length for Fragment 2
length_fragment_2 <- calculate_path_length(fragment_2_smooth)
print(paste("Length of Fragment 2:", length_fragment_2, "meters"))
## [1] "Length of Fragment 2: 427.701379863488 meters"
#For both in total:
# Combined dive-14 length; close to the 593 m from the smooth-fit table above
length_fragment_1+length_fragment_2 #600 m, so almost same as in the above table... dont know why the plot is so weird. But the value seems okay
## [1] 600.0957
# Shannon's H' per dive: tabulate morphotype counts, then vegan::diversity
H <- tapply(data.a$Morphotype, data.a$Dive.Name, function(morphos) {
  diversity(table(morphos), index = "shannon")
})
# Simpson's index per dive (same tabulation, different index)
S <- tapply(data.a$Morphotype, data.a$Dive.Name, function(morphos) {
  diversity(table(morphos), index = "simpson")
})
# Observed morphotype richness per dive (number of distinct morphotypes)
richness <- tapply(data.a$Morphotype, data.a$Dive.Name, function(morphos) {
  specnumber(table(morphos))
})
# Pielou's Evenness: J' = H' / ln(S)
# Evenness is a measure of the relative abundance of the different species in the same area.
# Low J indicates that 1 or few species dominate the community.
# J' is NaN when richness == 1 (division by ln(1) = 0), e.g. Umnak Canyon below.
evenness <- H/log(richness)
# Create alpha diversity dataframe, include info about environment and locations
div.df <- cbind(shannon = H, richness = richness, pielou = evenness, simps = S, ENV2)
# NOTE(review): positional indexing -- assumes the first column of data.c holds
# the short site code (e.g. "2304_02"); confirm column order of data.c
div.df$site <- data.c[,1]
kable(div.df) %>% kable_styling(font_size = 8, latex_options = c("striped"))
| shannon | richness | pielou | simps | sed_type | sed_consistency | oxygen | depth | temp | eastness | northness | rugosity | slope | bs | feature | site | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| EX2304_DIVE02 Big Bend Canyon | 0.9983277 | 14 | 0.3782895 | 0.4846525 | 3 | 2 | 2.1962855 | 2228.7927 | 1.827136 | 0.4689163 | 0.1141978 | 9.129195 | 37.35683 | 121.78761 | 1 | 2304_02 |
| EX2304_DIVE05 Lone Knoll Scarp | 1.5552652 | 9 | 0.7078317 | 0.7279378 | 3 | 2 | 3.1782767 | 2777.7042 | 1.613341 | -0.1479803 | 0.5893032 | 30.086932 | 27.89061 | 18.81481 | 5 | 2304_05 |
| EX2304_DIVE07 Uliaga Mound Take 2 | 2.5956375 | 33 | 0.7423514 | 0.9069230 | 5 | 3 | 0.9290499 | 657.7798 | 3.490384 | 0.1582887 | 0.0527446 | 14.017655 | 40.48497 | 75.80089 | 3 | 2304_07 |
| EX2304_DIVE08 Umnak Canyon | 0.0000000 | 1 | NaN | 0.0000000 | 1 | 1 | 1.0951864 | 1460.0114 | 2.271012 | -0.4666053 | -0.8508585 | 7.879064 | 15.01833 | 50.47222 | 1 | 2304_08 |
| EX2306_Dive01 Kodiak Shelf | 1.7023175 | 15 | 0.6286137 | 0.6916563 | 5 | 3 | 3.4607124 | 3033.9322 | 1.562592 | 0.3753842 | 0.9113762 | 82.076497 | 18.81724 | 82.02124 | 5 | 2306_01 |
| EX2306_Dive03 Giacomini Seamount | 2.1498557 | 23 | 0.6856513 | 0.8408186 | 5 | 3 | 0.5162314 | 814.8627 | 3.100999 | 0.6368190 | 0.7685244 | 53.575081 | 22.37613 | 151.49925 | 3 | 2306_03 |
| EX2306_Dive04 Quinn Seamount | 1.8524738 | 8 | 0.8908516 | 0.8270360 | 5 | 3 | 1.6349495 | 1929.7082 | 1.985380 | 0.8025602 | -0.4711487 | 60.576255 | 25.13941 | 125.69265 | 3 | 2306_04 |
| EX2306_Dive05 Surveyor Seamount | 1.9745089 | 24 | 0.6212950 | 0.8090708 | 5 | 3 | 0.6544988 | 513.0706 | 3.779863 | 0.3423503 | 0.9244736 | 61.071748 | 28.17219 | 29.21577 | 3 | 2306_05 |
| EX2306_Dive06 Durgin Guyot | 2.4916258 | 25 | 0.7740671 | 0.8880535 | 5 | 3 | 0.5487520 | 1131.5768 | 2.805364 | 0.8483461 | -0.4928408 | 77.441229 | 33.81878 | 159.94237 | 3 | 2306_06 |
| EX2306_Dive07 Deep Discover Dome | 0.8830443 | 23 | 0.2816284 | 0.3167201 | 5 | 3 | 3.5517937 | 3227.4160 | 1.547009 | 0.6927170 | 0.7062478 | 86.166682 | 18.31629 | 72.88986 | 3 | 2306_07 |
| EX2306_Dive08 Denson Seamount | 1.8164333 | 21 | 0.5966234 | 0.6812386 | 5 | 3 | 0.6390790 | 1362.0356 | 2.580212 | 0.4377730 | 0.8921247 | 89.790400 | 27.66106 | NaN | 3 | 2306_08 |
| EX2306_Dive12 Noyes Canyon | 0.5710963 | 15 | 0.2108884 | 0.2176131 | 2 | 1 | 1.0621382 | 1544.2669 | 2.330919 | -0.3129675 | 0.9497064 | 149.337704 | 32.59931 | 79.33240 | 1 | 2306_12 |
| EX2306_Dive14 Chatham Seep | 1.6203378 | 19 | 0.5503044 | 0.7487424 | 7 | 3 | 0.6125767 | 719.7000 | 4.071347 | 0.0294458 | -0.9181381 | 43.586525 | 10.58621 | 170.93689 | 4 | 2306_14 |
| EX2306_Dive15 Middleton Canyon | 2.2664271 | 37 | 0.6276595 | 0.8001073 | 5 | 3 | 2.1241257 | 2107.5228 | 1.820679 | -0.5628847 | -0.8262024 | 175.290324 | 33.84623 | 149.72432 | 1 | 2306_15 |
| EX2306_Dive18 Gumby Ridge | 1.8484203 | 30 | 0.5434616 | 0.7088397 | 3 | 2 | 1.3861362 | 1802.4660 | 2.089785 | -0.2675006 | 0.9148033 | 104.594112 | 24.40463 | 147.32355 | 2 | 2306_18 |
# NOTE: Dive 8 2304 has only one morphotype, so NA
# can also sort Locations by the median of 'shannon'
# div.df$site <- with(div.df, reorder(site, shannon, median))
# Shannon diversity per site, coloured by depth (reversed cividis palette)
# and shaped by sediment-consistency class
plot.shan <- ggplot(div.df,
                    aes(x = site, y = shannon,
                        colour = depth,
                        shape = as.factor(sed_consistency))) +
  geom_point(size = 3) +
  scale_colour_viridis_c(option = "cividis", begin = 1, end = 0) +
  labs(x = "", y = "Shannon's H") +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4),
    # keep margins tight so the panels align when combined
    plot.margin = unit(c(0, 0, 0, 0), "cm")
  )
# scale_shape_manual(values = c(15, 16, 17, 18, 19, 20, 21, 22, 23)) # optional custom shapes
plot.shan
# Observed richness per site, same aesthetics as the Shannon plot above
plot.rich <- ggplot(div.df,
                    aes(x = site, y = richness,
                        colour = depth,
                        shape = as.factor(sed_consistency))) +
  geom_point(size = 3) +
  scale_colour_viridis_c(option = "cividis", begin = 1, end = 0) +
  labs(x = "", y = "Richness") +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4),
    # keep margins tight so the panels align when combined
    plot.margin = unit(c(0, 0, 0, 0), "cm")
  )
# scale_shape_manual(values = c(15, 16, 17, 18, 19, 20, 21, 22, 23)) # optional custom shapes
plot.rich
# Pielou's evenness per site; Umnak Canyon (richness = 1) has NaN evenness
# and is dropped by ggplot with a warning (see output below)
plot.even <- ggplot(div.df, aes(x = site, y = pielou, colour = depth, shape = as.factor(sed_consistency))) +
  geom_point(size = 3) +
  scale_colour_viridis_c(option = "cividis", begin = 1, end = 0) +
  ylab("Pielou's evenness") +  # FIX: axis label typo ("Pielous eveness")
  xlab("") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4),
        plot.margin = unit(c(0, 0, 0, 0), "cm")) # Ensure margins are set
# scale_shape_manual(values = c(15, 16, 17, 18, 19, 20, 21, 22, 23)) # optional custom shapes
plot.even
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
> Now, we explore the alpha diversity further
# What impacts have the various environmental factors on the diversity?
# Fit every response ~ single-predictor linear model and print each summary.
# Define response variables (alpha diversity metrics computed above)
responses <- c("shannon", "simps", "pielou", "richness")
# Define predictor variables (site-level environmental covariates in div.df)
predictors <- c("depth", "sed_type", "sed_consistency", "temp", "slope", "feature")
# Run linear models and print summaries.
# NOTE: the names resp/pred appear verbatim in each printed Call line,
# so renaming them would change the rendered output below.
lm_results <- lapply(responses, function(resp) {
cat("\n### Response:", resp, "###\n")
lapply(predictors, function(pred) {
cat("\nModel:", resp, "~", pred, "\n")
print(summary(lm(as.formula(paste(resp, "~", pred)), data = div.df)))
})})
##
## ### Response: shannon ###
##
## Model: shannon ~ depth
##
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.6862 -0.2888 0.2429 0.3811 0.7639
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.1005074 0.4102993 5.119 0.000197 ***
## depth -0.0002837 0.0002178 -1.303 0.215191
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.707 on 13 degrees of freedom
## Multiple R-squared: 0.1155, Adjusted R-squared: 0.04747
## F-statistic: 1.698 on 1 and 13 DF, p-value: 0.2152
##
##
## Model: shannon ~ sed_type
##
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.97387 -0.27041 -0.00444 0.37465 0.73873
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.25333 0.43429 0.583 0.56966
## sed_type 0.32072 0.09615 3.335 0.00537 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5518 on 13 degrees of freedom
## Multiple R-squared: 0.4611, Adjusted R-squared: 0.4197
## F-statistic: 11.13 on 1 and 13 DF, p-value: 0.005369
##
##
## Model: shannon ~ sed_consistency
##
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.08915 -0.24636 0.00232 0.31416 0.62724
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.2808 0.4536 -0.619 0.546561
## sed_consistency 0.7510 0.1723 4.359 0.000774 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4791 on 13 degrees of freedom
## Multiple R-squared: 0.5938, Adjusted R-squared: 0.5625
## F-statistic: 19 on 1 and 13 DF, p-value: 0.0007739
##
##
## Model: shannon ~ temp
##
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.5623 -0.4366 0.2014 0.3726 0.8468
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.8426 0.5870 1.435 0.175
## temp 0.3169 0.2272 1.395 0.186
##
## Residual standard error: 0.701 on 13 degrees of freedom
## Multiple R-squared: 0.1302, Adjusted R-squared: 0.06333
## F-statistic: 1.947 on 1 and 13 DF, p-value: 0.1863
##
##
## Model: shannon ~ slope
##
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2567 -0.2913 0.2945 0.4506 0.6637
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.73831 0.60606 1.218 0.245
## slope 0.03342 0.02191 1.525 0.151
##
## Residual standard error: 0.6923 on 13 degrees of freedom
## Multiple R-squared: 0.1518, Adjusted R-squared: 0.08659
## F-statistic: 2.327 on 1 and 13 DF, p-value: 0.1511
##
##
## Model: shannon ~ feature
##
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2670 -0.4568 0.1401 0.4252 0.9994
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.0624 0.4214 2.521 0.0255 *
## feature 0.2046 0.1394 1.468 0.1660
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6962 on 13 degrees of freedom
## Multiple R-squared: 0.1421, Adjusted R-squared: 0.07615
## F-statistic: 2.154 on 1 and 13 DF, p-value: 0.166
##
##
## ### Response: simps ###
##
## Model: simps ~ depth
##
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.66476 -0.05015 0.07641 0.18155 0.20662
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.026e-01 1.538e-01 5.217 0.000166 ***
## depth -9.442e-05 8.165e-05 -1.156 0.268330
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2651 on 13 degrees of freedom
## Multiple R-squared: 0.09327, Adjusted R-squared: 0.02352
## F-statistic: 1.337 on 1 and 13 DF, p-value: 0.2683
##
##
## Model: simps ~ sed_type
##
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.41738 -0.09893 0.06601 0.13034 0.24149
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.11497 0.15474 0.743 0.47070
## sed_type 0.12383 0.03426 3.614 0.00314 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1966 on 13 degrees of freedom
## Multiple R-squared: 0.5012, Adjusted R-squared: 0.4629
## F-statistic: 13.06 on 1 and 13 DF, p-value: 0.003144
##
##
## Model: simps ~ sed_consistency
##
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.45610 -0.05262 0.02729 0.09162 0.23267
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.05984 0.16851 -0.355 0.728208
## sed_consistency 0.27755 0.06400 4.337 0.000806 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.178 on 13 degrees of freedom
## Multiple R-squared: 0.5913, Adjusted R-squared: 0.5599
## F-statistic: 18.81 on 1 and 13 DF, p-value: 0.0008061
##
##
## Model: simps ~ temp
##
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.62117 -0.08454 0.10906 0.16926 0.23958
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.35308 0.21722 1.625 0.128
## temp 0.11805 0.08405 1.404 0.184
##
## Residual standard error: 0.2594 on 13 degrees of freedom
## Multiple R-squared: 0.1317, Adjusted R-squared: 0.06496
## F-statistic: 1.973 on 1 and 13 DF, p-value: 0.1836
##
##
## Model: simps ~ slope
##
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.53404 -0.11135 0.08585 0.16159 0.25713
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.390288 0.232378 1.680 0.117
## slope 0.009572 0.008400 1.139 0.275
##
## Residual standard error: 0.2654 on 13 degrees of freedom
## Multiple R-squared: 0.0908, Adjusted R-squared: 0.02087
## F-statistic: 1.298 on 1 and 13 DF, p-value: 0.2751
##
##
## Model: simps ~ feature
##
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.47315 -0.15600 0.01177 0.16446 0.32696
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.37498 0.14703 2.550 0.0242 *
## feature 0.09816 0.04865 2.018 0.0648 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2429 on 13 degrees of freedom
## Multiple R-squared: 0.2385, Adjusted R-squared: 0.1799
## F-statistic: 4.071 on 1 and 13 DF, p-value: 0.06476
##
##
## ### Response: pielou ###
##
## Model: pielou ~ depth
##
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.38699 -0.08173 0.01656 0.11165 0.31556
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.883e-01 1.112e-01 6.191 4.65e-05 ***
## depth -5.859e-05 5.819e-05 -1.007 0.334
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1884 on 12 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.0779, Adjusted R-squared: 0.001062
## F-statistic: 1.014 on 1 and 12 DF, p-value: 0.3339
##
##
## Model: pielou ~ sed_type
##
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.33780 -0.09387 0.00870 0.10874 0.27142
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.31047 0.17909 1.734 0.109
## sed_type 0.06179 0.03837 1.610 0.133
##
## Residual standard error: 0.1779 on 12 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.1777, Adjusted R-squared: 0.1092
## F-statistic: 2.594 on 1 and 12 DF, p-value: 0.1333
##
##
## Model: pielou ~ sed_consistency
##
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.36796 -0.08586 -0.02145 0.08578 0.24127
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.13677 0.19078 0.717 0.4872
## sed_consistency 0.17094 0.07034 2.430 0.0317 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1606 on 12 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.3298, Adjusted R-squared: 0.274
## F-statistic: 5.906 on 1 and 12 DF, p-value: 0.03172
##
##
## Model: pielou ~ temp
##
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.37111 -0.09140 0.03547 0.10045 0.32491
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.47373 0.16223 2.920 0.0128 *
## temp 0.04645 0.06227 0.746 0.4701
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1918 on 12 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.04431, Adjusted R-squared: -0.03533
## F-statistic: 0.5564 on 1 and 12 DF, p-value: 0.4701
##
##
## Model: pielou ~ slope
##
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.38721 -0.03211 0.02922 0.11506 0.30608
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.539835 0.189232 2.853 0.0145 *
## slope 0.001787 0.006675 0.268 0.7934
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1956 on 12 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.00594, Adjusted R-squared: -0.0769
## F-statistic: 0.07171 on 1 and 12 DF, p-value: 0.7934
##
##
## Model: pielou ~ feature
##
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.31627 -0.09741 0.00491 0.13028 0.29295
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.40128 0.11723 3.423 0.00505 **
## feature 0.06554 0.03761 1.742 0.10698
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1753 on 12 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2019, Adjusted R-squared: 0.1354
## F-statistic: 3.036 on 1 and 12 DF, p-value: 0.107
##
##
## ### Response: richness ###
##
## Model: richness ~ depth
##
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.442 -4.737 0.281 5.590 18.387
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.566428 5.682605 4.323 0.000827 ***
## depth -0.002825 0.003016 -0.937 0.366057
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.791 on 13 degrees of freedom
## Multiple R-squared: 0.06321, Adjusted R-squared: -0.008853
## F-statistic: 0.8772 on 1 and 13 DF, p-value: 0.3661
##
##
## Model: richness ~ sed_type
##
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.0445 -6.9838 0.9555 2.5466 14.9555
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.741 6.978 0.966 0.3517
## sed_type 3.061 1.545 1.981 0.0691 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.866 on 13 degrees of freedom
## Multiple R-squared: 0.2319, Adjusted R-squared: 0.1728
## F-statistic: 3.925 on 1 and 13 DF, p-value: 0.06913
##
##
## Model: richness ~ sed_consistency
##
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.0345 -5.5690 -0.0345 3.8966 13.9655
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.241 8.132 0.276 0.7872
## sed_consistency 6.931 3.088 2.244 0.0429 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.588 on 13 degrees of freedom
## Multiple R-squared: 0.2792, Adjusted R-squared: 0.2238
## F-statistic: 5.037 on 1 and 13 DF, p-value: 0.04286
##
##
## Model: richness ~ temp
##
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.1891 -5.2212 -0.1077 5.1200 19.2789
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.786 8.143 1.447 0.171
## temp 3.260 3.151 1.035 0.320
##
## Residual standard error: 9.724 on 13 degrees of freedom
## Multiple R-squared: 0.07608, Adjusted R-squared: 0.005004
## F-statistic: 1.07 on 1 and 13 DF, p-value: 0.3197
##
##
## Model: richness ~ slope
##
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.417 -9.330 1.716 6.623 13.704
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.3337 8.0831 0.907 0.381
## slope 0.4716 0.2922 1.614 0.131
##
## Residual standard error: 9.233 on 13 degrees of freedom
## Multiple R-squared: 0.1669, Adjusted R-squared: 0.1029
## F-statistic: 2.605 on 1 and 13 DF, p-value: 0.1305
##
##
## Model: richness ~ feature
##
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.037 -6.537 1.390 4.890 15.963
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 21.7513 6.0934 3.570 0.00343 **
## feature -0.7139 2.0163 -0.354 0.72896
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.07 on 13 degrees of freedom
## Multiple R-squared: 0.009552, Adjusted R-squared: -0.06664
## F-statistic: 0.1254 on 1 and 13 DF, p-value: 0.729
# Sediment variables are the clearest significant predictors of alpha
# diversity in the models above (consistency for all four indices)
# Plot Shannon vs depth with a linear regression to show the (weak) trend
depth.reg <- ggplot(div.df, aes(x = depth, y = shannon, colour = site)) +
  geom_smooth(method = "lm", colour = "black", fill = "grey90") +
  geom_point(size = 3) +
  scale_colour_viridis_d(option = "magma", begin = 0.2, end = 0.8) +
  # FIX: bquote(Depth (m)) parses as the call Depth(m) and renders without
  # the space; a plain string gives the intended "Depth (m)" label
  xlab("Depth (m)") +
  ylab("Shannon's H'") +
  ylim(0, 3.5) +
  theme_bw()
depth.reg
## `geom_smooth()` using formula = 'y ~ x'
# Shannon diversity vs primary sediment consistency class, per site
sediments.reg <- ggplot(div.df, aes(x = sed_consistency, y = shannon, colour = site)) +
  geom_point(size = 3) +
  scale_colour_viridis_d(option = "magma", begin = 0.2, end = 0.8) +
  # FIX: dropped the spurious "(m)" unit -- sediment consistency is an
  # ordinal class, not a length in metres
  xlab("Primary sediment consistency") +
  ylab("Shannon's H'") +
  ylim(0, 3.5) +
  theme_bw()
sediments.reg
# Run one-way ANOVAs for the same response ~ predictor pairs as the lm loop
# above and print the summaries (F statistics match the lm fits, as expected
# for a single predictor).
# NOTE: resp/pred names appear verbatim in the printed output -- keep them.
anova_results <- lapply(responses, function(resp) {
cat("\n### Response:", resp, "###\n")
lapply(predictors, function(pred) {
cat("\nModel:", resp, "~", pred, "\n")
print(summary(aov(as.formula(paste(resp, "~", pred)), data = div.df)))
})})
##
## ### Response: shannon ###
##
## Model: shannon ~ depth
## Df Sum Sq Mean Sq F value Pr(>F)
## depth 1 0.849 0.8485 1.698 0.215
## Residuals 13 6.497 0.4998
##
## Model: shannon ~ sed_type
## Df Sum Sq Mean Sq F value Pr(>F)
## sed_type 1 3.387 3.387 11.12 0.00537 **
## Residuals 13 3.958 0.304
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Model: shannon ~ sed_consistency
## Df Sum Sq Mean Sq F value Pr(>F)
## sed_consistency 1 4.362 4.362 19 0.000774 ***
## Residuals 13 2.984 0.230
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Model: shannon ~ temp
## Df Sum Sq Mean Sq F value Pr(>F)
## temp 1 0.957 0.9567 1.947 0.186
## Residuals 13 6.389 0.4915
##
## Model: shannon ~ slope
## Df Sum Sq Mean Sq F value Pr(>F)
## slope 1 1.115 1.1153 2.327 0.151
## Residuals 13 6.230 0.4793
##
## Model: shannon ~ feature
## Df Sum Sq Mean Sq F value Pr(>F)
## feature 1 1.044 1.0441 2.154 0.166
## Residuals 13 6.302 0.4847
##
## ### Response: simps ###
##
## Model: simps ~ depth
## Df Sum Sq Mean Sq F value Pr(>F)
## depth 1 0.0940 0.09397 1.337 0.268
## Residuals 13 0.9135 0.07027
##
## Model: simps ~ sed_type
## Df Sum Sq Mean Sq F value Pr(>F)
## sed_type 1 0.5050 0.5050 13.06 0.00314 **
## Residuals 13 0.5025 0.0387
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Model: simps ~ sed_consistency
## Df Sum Sq Mean Sq F value Pr(>F)
## sed_consistency 1 0.5957 0.5957 18.81 0.000806 ***
## Residuals 13 0.4117 0.0317
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Model: simps ~ temp
## Df Sum Sq Mean Sq F value Pr(>F)
## temp 1 0.1327 0.13273 1.973 0.184
## Residuals 13 0.8747 0.06729
##
## Model: simps ~ slope
## Df Sum Sq Mean Sq F value Pr(>F)
## slope 1 0.0915 0.09148 1.298 0.275
## Residuals 13 0.9160 0.07046
##
## Model: simps ~ feature
## Df Sum Sq Mean Sq F value Pr(>F)
## feature 1 0.2403 0.24026 4.071 0.0648 .
## Residuals 13 0.7672 0.05902
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## ### Response: pielou ###
##
## Model: pielou ~ depth
## Df Sum Sq Mean Sq F value Pr(>F)
## depth 1 0.0360 0.03599 1.014 0.334
## Residuals 12 0.4259 0.03549
## 1 observation deleted due to missingness
##
## Model: pielou ~ sed_type
## Df Sum Sq Mean Sq F value Pr(>F)
## sed_type 1 0.0821 0.08209 2.594 0.133
## Residuals 12 0.3798 0.03165
## 1 observation deleted due to missingness
##
## Model: pielou ~ sed_consistency
## Df Sum Sq Mean Sq F value Pr(>F)
## sed_consistency 1 0.1524 0.1524 5.906 0.0317 *
## Residuals 12 0.3096 0.0258
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1 observation deleted due to missingness
##
## Model: pielou ~ temp
## Df Sum Sq Mean Sq F value Pr(>F)
## temp 1 0.0205 0.02047 0.556 0.47
## Residuals 12 0.4415 0.03679
## 1 observation deleted due to missingness
##
## Model: pielou ~ slope
## Df Sum Sq Mean Sq F value Pr(>F)
## slope 1 0.0027 0.00274 0.072 0.793
## Residuals 12 0.4592 0.03826
## 1 observation deleted due to missingness
##
## Model: pielou ~ feature
## Df Sum Sq Mean Sq F value Pr(>F)
## feature 1 0.0933 0.09327 3.036 0.107
## Residuals 12 0.3687 0.03072
## 1 observation deleted due to missingness
##
## ### Response: richness ###
##
## Model: richness ~ depth
## Df Sum Sq Mean Sq F value Pr(>F)
## depth 1 84.1 84.09 0.877 0.366
## Residuals 13 1246.3 95.87
##
## Model: richness ~ sed_type
## Df Sum Sq Mean Sq F value Pr(>F)
## sed_type 1 308.5 308.52 3.925 0.0691 .
## Residuals 13 1021.9 78.61
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Model: richness ~ sed_consistency
## Df Sum Sq Mean Sq F value Pr(>F)
## sed_consistency 1 371.5 371.5 5.037 0.0429 *
## Residuals 13 958.9 73.8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Model: richness ~ temp
## Df Sum Sq Mean Sq F value Pr(>F)
## temp 1 101.2 101.21 1.07 0.32
## Residuals 13 1229.2 94.55
##
## Model: richness ~ slope
## Df Sum Sq Mean Sq F value Pr(>F)
## slope 1 222.1 222.10 2.605 0.131
## Residuals 13 1108.3 85.25
##
## Model: richness ~ feature
## Df Sum Sq Mean Sq F value Pr(>F)
## feature 1 12.7 12.71 0.125 0.729
## Residuals 13 1317.7 101.36
# Explore Shannon pooled within sediment-consistency classes (hard, medium, soft)
sed.plot <- ggplot(div.df, aes(x = sed_consistency, y = shannon, fill = as.factor(sed_consistency))) +
  geom_boxplot(aes(fill = as.factor(sed_consistency))) +
  geom_point(size = 3, aes(colour = site)) +
  scale_colour_viridis_d(option = "magma", begin = 0.2, end = 0.8) +
  # FIX: guide = FALSE was deprecated in ggplot2 3.3.4 (it triggered the
  # lifecycle warning below); "none" suppresses the fill legend cleanly
  scale_fill_manual(values = c("grey60", "grey90", "black"), guide = "none") +
  ylab("Shannon's H'") +
  xlab('') +
  theme_bw()
sed.plot
## Warning: The `guide` argument in `scale_*()` cannot be `FALSE`. This was deprecated in
## ggplot2 3.3.4.
## ℹ Please use "none" instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Same exploration as above, but grouped by primary sediment type
sed.plot2 <- ggplot(div.df,
                    aes(x = sed_type, y = shannon, fill = as.factor(sed_type))) +
  geom_boxplot(aes(fill = as.factor(sed_type))) +
  geom_point(aes(colour = site), size = 3) +
  scale_colour_viridis_d(option = "magma", begin = 0.2, end = 0.8) +
  # scale_fill_manual(guide = "none") # optional: hide the fill legend
  ylab("Shannon's H'") +
  xlab("") +
  theme_bw()
sed.plot2
# NOTE: When adjusting the depth bins there was no obvious difference
Summary of the alpha diversity: Only sediments had a significant effect on the within-site diversities. Hard substrates show the highest Shannon values. Uliaga seamount is the most diverse site, and Umnak Canyon the lowest, with an index of 0 from only 1 morphotype. There was no clear trend with feature, but a slight negative trend with depth.
# To calculate Beta diversity (across sites) pairwise dissimilarities/distances need to be calculated
# Prepare data: convert morphotype abundances to a presence-absence matrix
SPE.pa <- SPE2
SPE.pa[SPE.pa > 0] <- 1
# Calculate beta diversity using Bray-Curtis dissimilarity.
# NOTE: on presence-absence data Bray-Curtis is equivalent to the Sorensen index.
bray.df <- vegdist(SPE.pa, method = "bray")
# Optionally, convert dissimilarity matrix to dataframe
bray.df <- as.data.frame(as.matrix(bray.df))
# Set the row names of the distance matrix to Dive Names
# NOTE(review): assumes rows of SPE2 are in the same order as data.c -- confirm
rownames(bray.df) <- data.c$Dive.Name
colnames(bray.df) <- data.c$Dive.Name
#bray.df <- cbind(bray.df, ENV2)
# The matrix can be further used to explore how distant sites are.
# 1) Are sites that are closer to each other more similar? GEOGRAPHIC DISTANCE
# Create a list with the first coordinate pair and Dive name for each dive site
# NOTE(review): 'divesites' is defined earlier in the document -- presumably the
# vector of Dive.Name values, one per site; confirm
first_coordinates_list <- lapply(divesites, function(dive_site) {
# Extract the first recorded coordinates for this dive (start of the transect)
first_coordinates <- data.a %>%
filter(Dive.Name == dive_site) %>%
slice(1) %>%
dplyr::select(Dive.Name, Latitude..deg., Longitude..deg.)
return(first_coordinates)
})
# Extract latitude and longitude values from the list of data frames
coordinates <- lapply(first_coordinates_list, function(df) df[, c("Longitude..deg.", "Latitude..deg.")])
# Convert the list of coordinates to a matrix (one row per site, lon/lat order)
coordinates_matrix <- do.call(rbind, coordinates)
# Pairwise great-circle distances between site start points.
# NOTE: geosphere::distm defaults to distGeo and returns metres
# (not Haversine / km as this comment previously claimed).
distance_matrix <- distm(coordinates_matrix)
rownames(distance_matrix) <- data.c$Dive.Name
colnames(distance_matrix) <- data.c$Dive.Name
# Print the distance matrix - if wanted
# print(distance_matrix)
heatmap(distance_matrix)
# Convert the geographic distance_matrix to long format
# (this comment previously said depth_distance_matrix; that comes later)
geo_long <- as.data.frame(as.table(distance_matrix))
colnames(geo_long) <- c("Site1", "Site2", "Geo")
# Convert the Bray-Curtis matrix to long format
bray_long <- as.data.frame(as.table(as.matrix(bray.df)))
colnames(bray_long) <- c("Site1", "Site2", "Bray_Curtis")
# Merge the dataframes on Site1 and Site2
combined_df <- merge(geo_long, bray_long, by = c("Site1", "Site2"))
# Remove rows where Site1 == Site2 (comparisons with self)
# NOTE(review): each unordered pair still appears twice (A-B and B-A), which
# inflates n in the Pearson test below -- a Mantel test may be more appropriate
combined_df <- combined_df[combined_df$Site1 != combined_df$Site2, ]
# Distance-decay plot: Bray-Curtis dissimilarity vs geographic distance
ggplot(combined_df, aes(x = Geo, y = Bray_Curtis)) +
geom_point(aes(color = Bray_Curtis)) + # Points colored by Bray-Curtis value
geom_smooth(method = "lm", se = TRUE, color = "black") + # Linear regression line
stat_cor(method = "pearson", label.x = 1500000, label.y = 0.55) + # Add correlation coefficient (r) and p-value
labs(x = "Geographic Distance (m)", y = "Bray-Curtis Dissimilarity") +
theme_minimal() +
scale_color_viridis_c() # Color scale for Bray-Curtis dissimilarity
## `geom_smooth()` using formula = 'y ~ x'
# 2) Are sites that share similar depths more similar? DEPTH DISTANCE
# Pairwise absolute depth differences between sites; data.c holds the mean
# depth per site. outer() vectorises the former nested for-loops over all
# site pairs and yields the identical numeric matrix.
depth_distance_matrix <- abs(outer(data.c$depth, data.c$depth, "-"))
# Label rows/columns with the dive names so the matrix can be joined later
rownames(depth_distance_matrix) <- data.c$Dive.Name
colnames(depth_distance_matrix) <- data.c$Dive.Name
# Convert depth_distance_matrix to long format
depth_long <- as.data.frame(as.table(depth_distance_matrix))
colnames(depth_long) <- c("Site1", "Site2", "Depth")
# Merge with the Bray-Curtis long table on Site1 and Site2
combined_df2 <- merge(depth_long, bray_long, by = c("Site1", "Site2"))
# Remove rows where Site1 == Site2 (comparisons with self)
combined_df2 <- combined_df2[combined_df2$Site1 != combined_df2$Site2, ]
# Distance-decay plot: Bray-Curtis dissimilarity vs absolute depth difference
ggplot(combined_df2, aes(x = Depth, y = Bray_Curtis)) +
geom_point(aes(color = Bray_Curtis)) + # Points colored by Bray-Curtis value
geom_smooth(method = "lm", se = TRUE, color = "black") + # Linear regression line
stat_cor(method = "pearson", label.x = 2000, label.y = 0.55) + # Add correlation coefficient (r) and p-value. TEST: Linear regression, because the data is pairwaise (testing the relationship between two variables)
labs(x = "Depth Distance (m)", y = "Bray-Curtis Dissimilarity") +
theme_minimal() +
scale_color_viridis_c() # Color scale for Bray-Curtis dissimilarity
## `geom_smooth()` using formula = 'y ~ x'
Summary of beta diversity: both geographic distance and depth distance influence beta diversity. Sites that are geographically closer are more similar to each other, and the relationship is even stronger when depth distance is considered.
# Principal Coordinates Analysis (PCoA) of the Bray-Curtis matrix, 2 axes kept
pcoa.bray <- cmdscale(bray.df, k = 2, eig = T)
# print(pcoa.bray) #explore output if wanted
# extract axis positions for each site from cmdscale object and create a dataframe for plotting
pcoa.bray.plotting <- as.data.frame(pcoa.bray$points)
colnames(pcoa.bray.plotting) <- c("pcoa1", "pcoa2")
# Attach the environmental data so plots can be coloured/shaped by covariates
pcoa.bray.plotting <- cbind(pcoa.bray.plotting, ENV2)
# Proportion of variance explained by the first two PCoA axes.
# NOTE(review): non-Euclidean dissimilarities can produce negative eigenvalues,
# which would distort these fractions -- worth inspecting pcoa.bray$eig
pcoa.bray$eig[1]/(sum(pcoa.bray$eig))
## [1] 0.1780102
pcoa.bray$eig[2]/(sum(pcoa.bray$eig))
## [1] 0.1293653
# Add site and water-mass labels for the ordination plot
pcoa.bray.plotting$site <- data.c$Dive.Name
pcoa.bray.plotting$watermass <- data.c$watermass
pcoa.bray.plotting$watermass <- data.c$watermass
# PCoA ordination of sites: coloured by site, shaped by water mass,
# with non-overlapping site labels via ggrepel
ggplot(pcoa.bray.plotting,
       aes(x = pcoa1, y = pcoa2, label = site, color = site, shape = watermass)) +
  geom_point(size = 4) +
  # scale_shape_manual(values = c(15, 16, 17, 18, 19))
  # ggtitle("PCA of Bray Curtis Dissimilarities")
  geom_text_repel(min.segment.length = 0, size = 4, box.padding = 1) +
  labs(col = "Site", shape = "Water Mass",
       x = "PCoA1",
       y = "PCoA2",
       size = 5) +
  theme_linedraw() +
  # stat_ellipse(aes(col = current), level = 0.95, linetype = "dashed") # 95% confidence ellipses
  theme(
    panel.grid = element_blank(),
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 10),
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    legend.background = element_blank(),
    legend.box.background = element_rect(colour = "black"),
    legend.position = "right"
  )
# Test the significance of watermass and other factors on the distances
# TEST: Adonis2/PERMANOVA. Multivariate test for a distance matrix such as Bray-Curtis.
# Each predictor is tested in its own single-term model (999 free permutations each);
# for joint/marginal effects a single combined model would be needed instead.
variables <- c("depth", "watermass", "feature", "slope","rugosity","eastness", "sed_consistency","lon","lat")
# One adonis2 fit per variable; the results list is in the same order as 'variables'
results <- lapply(variables, function(var) adonis2(as.formula(paste("bray.df ~", var)), data = data.c))
results
## [[1]]
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = as.formula(paste("bray.df ~", var)), data = data.c)
## Df SumOfSqs R2 F Pr(>F)
## Model 1 0.8436 0.1557 2.3974 0.001 ***
## Residual 13 4.5744 0.8443
## Total 14 5.4181 1.0000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## [[2]]
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = as.formula(paste("bray.df ~", var)), data = data.c)
## Df SumOfSqs R2 F Pr(>F)
## Model 4 2.3849 0.44019 1.9658 0.001 ***
## Residual 10 3.0331 0.55981
## Total 14 5.4181 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## [[3]]
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = as.formula(paste("bray.df ~", var)), data = data.c)
## Df SumOfSqs R2 F Pr(>F)
## Model 1 0.3557 0.06564 0.9133 0.634
## Residual 13 5.0624 0.93436
## Total 14 5.4181 1.00000
##
## [[4]]
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = as.formula(paste("bray.df ~", var)), data = data.c)
## Df SumOfSqs R2 F Pr(>F)
## Model 1 0.4233 0.07812 1.1017 0.299
## Residual 13 4.9948 0.92188
## Total 14 5.4181 1.00000
##
## [[5]]
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = as.formula(paste("bray.df ~", var)), data = data.c)
## Df SumOfSqs R2 F Pr(>F)
## Model 1 0.5206 0.09609 1.382 0.072 .
## Residual 13 4.8974 0.90391
## Total 14 5.4181 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## [[6]]
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = as.formula(paste("bray.df ~", var)), data = data.c)
## Df SumOfSqs R2 F Pr(>F)
## Model 1 0.4412 0.08142 1.1524 0.238
## Residual 13 4.9769 0.91858
## Total 14 5.4181 1.00000
##
## [[7]]
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = as.formula(paste("bray.df ~", var)), data = data.c)
## Df SumOfSqs R2 F Pr(>F)
## Model 1 0.4419 0.08156 1.1544 0.247
## Residual 13 4.9762 0.91844
## Total 14 5.4181 1.00000
##
## [[8]]
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = as.formula(paste("bray.df ~", var)), data = data.c)
## Df SumOfSqs R2 F Pr(>F)
## Model 1 0.5923 0.10932 1.5955 0.018 *
## Residual 13 4.8258 0.89068
## Total 14 5.4181 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## [[9]]
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = as.formula(paste("bray.df ~", var)), data = data.c)
## Df SumOfSqs R2 F Pr(>F)
## Model 1 0.5410 0.09986 1.4421 0.048 *
## Residual 13 4.8770 0.90014
## Total 14 5.4181 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
To continue with the ordination methods, the environmental and species data need to be transformed. This is especially important for count data with many zeros. The most commonly used approaches are the Hellinger transformation for species data and standardisation for environmental data.
# Morphotype data
# Hellinger transformation (square root of relative abundances) -- reduces the
# influence of large counts and double zeros before Euclidean-based ordination
# SPE is the "full" data, so all annotations
SPE.hel <- decostand(SPE, method="hellinger")
plotNormalHistogram(SPE.hel) # checking the distribution
#SPE2 is the aggregated data, grouped by sites
SPE2.hel <- decostand(SPE2, method="hellinger")
plotNormalHistogram(SPE2.hel) # checking the distribution
# Hellinger pre-transformation of the species data
# Compute square root of relative abundances per site
# (SPE.fam is presumably the family-level species table -- defined earlier; TODO confirm)
SPE.hel3 <-decostand(SPE.fam, method="hellinger") # full data, not grouped by site
SPE.hel3$site <- data.b$Dive.Name
SPE.hel3 <- SPE.hel3[SPE.hel3$site != "2306_08", ]
SPE.hel3 <- SPE.hel3 %>% dplyr::select(-site) # remove the site column again
# Environmental data
# Standardization: centre each variable to mean 0 and scale to unit variance
env.z <- decostand(ENV, method="standardize") # After Borcard
env.z <- na.omit(env.z) # remove the rows with NAs, that is no values. This removes the dive EX2306 Dive 08 as there was no Backscatter data
env2.z <- decostand(ENV2, method= "standardize") # After Borcard
env2.z <- na.omit(env2.z) # remove the rows with NAs, that is no values. This removes the dive EX2306 Dive 08 as there was no Backscatter data
The cluster analysis is an exploratory data analysis, and can be used to group a set of data into clusters. First, I am exploring the species data (morphotype-level) to assess clustering patterns among different morphotypes per site. The dissimilarity between morphotypes is computed using different clustering methods to find the best fit. I am using the site grouped dataset (SPE2) because I want to see which sites share similar morphotype compositions.
# First: Species data clustering
# Used Borcard's suggestion to explore the species data
# Set seed for reproducibility
set.seed(123)
# Compute matrix of chord distance among morphotypes
# NOTE(review): Euclidean distance on Hellinger-transformed abundances is the
# Hellinger distance; "chord" would be Euclidean on chord-transformed data.
# The "Chord" plot titles below follow that naming -- confirm which is intended.
spe.ch <- as.matrix(vegdist(SPE2.hel, "euc"))
rownames(spe.ch) <- rownames(SPE2.hel) # Ensure meaningful labels if available
colnames(spe.ch) <- rownames(SPE2.hel)
spe.ch <- as.dist(spe.ch)
# Compute different hierarchical clustering methods (compared below via cophenetic correlation)
spe.ch.single <- hclust(spe.ch, method = "single")
spe.ch.complete <- hclust(spe.ch, method = "complete")
spe.ch.UPGMA <- hclust(spe.ch, method = "average") # UPGMA (best cophenetic correlation)
spe.ch.ward <- hclust(spe.ch, method = "ward.D2")
# Plot dendrograms
par(mfrow = c(2,2)) # Arrange plots in a grid
plot(spe.ch.single, main = "Chord - Single linkage", cex = 0.8)
plot(spe.ch.complete, main = "Chord - Complete linkage", cex = 0.8)
plot(spe.ch.UPGMA, main = "Chord - UPGMA", cex = 0.8)
plot(spe.ch.ward, main = "Chord - Ward", cex = 0.8)
par(mfrow = c(1,1)) # Reset plot layout
# Compute cophenetic correlation coefficients to evaluate clustering methods
# (correlation between the original distances and the dendrogram's cophenetic distances;
# higher = the dendrogram distorts the distance matrix less)
spe.ch.single.coph <- cophenetic(spe.ch.single)
spe.ch.comp.coph <- cophenetic(spe.ch.complete)
spe.ch.UPGMA.coph <- cophenetic(spe.ch.UPGMA)
spe.ch.ward.coph <- cophenetic(spe.ch.ward)
# Print correlation values
cor(spe.ch, spe.ch.single.coph) # Single Linkage
## [1] 0.7773492
cor(spe.ch, spe.ch.comp.coph) # Complete Linkage
## [1] 0.8243592
cor(spe.ch, spe.ch.UPGMA.coph) # UPGMA (expected highest)
## [1] 0.8797714
cor(spe.ch, spe.ch.ward.coph) # Ward's
## [1] 0.6844803
# Highest correlation found in the UPGMA clustering with 0.88
# Cluster validation using bootstrap resampling
# pvclust clusters the COLUMNS of its input, hence t() so that sites are clustered
spech.pv <- pvclust(t(SPE2.hel), method.hclust = "average", method.dist = "euc", parallel = TRUE)
## Creating a temporary cluster...done:
## socket cluster with 13 nodes on host 'localhost'
## Multiscale bootstrap... Done.
plot(spech.pv)
# Highlight clusters with high AU p-values (≥0.95 = significant)
pvrect(spech.pv, alpha = 0.95, pv = "au")
# Alternative: Determine optimal number of clusters using silhouette method
# NOTE(review): silhouette() is from the 'cluster' package, which is not in the
# library() list at the top of this document -- confirm it is loaded elsewhere.
# Define a range of k values to test (15 sites, so k up to n - 1 = 14)
k_values <- 2:14
sil_widths <- numeric(length(k_values))
# Compute silhouette width for each k
for (i in seq_along(k_values)) {
cluster_assignments <- cutree(spe.ch.UPGMA, k = k_values[i]) # Get cluster labels
sil <- silhouette(cluster_assignments, spe.ch) # Compute silhouette scores
sil_widths[i] <- mean(sil[, 3]) # Store average silhouette width (column 3 = sil_width)
}
# Plot silhouette width vs. number of clusters
plot(k_values, sil_widths, type = "b", pch = 19, frame = FALSE,
xlab = "Number of Clusters (k)", ylab = "Average Silhouette Width",
main = "Optimal Cluster Selection via Silhouette Width")
abline(v = k_values[which.max(sil_widths)], col = "red", lty = 2) # Mark optimal k
# Cut the UPGMA tree at the chosen k and tabulate cluster sizes
cluster_membership <- cutree(spe.ch.UPGMA, k = 7)
table(cluster_membership)
## cluster_membership
## 1 2 3 4 5 6 7
## 5 2 1 3 1 1 2
# Final reordered dendrogram with chosen clusters
spe.chwo <- reorder.hclust(spe.ch.UPGMA, spe.ch)
plot(spe.chwo, hang = 0, xlab = "Cluster Groups", ylab = "Height", main = "Chord - UPGMA (Reordered)")
rect.hclust(spe.chwo, k = 7) # Highlight clusters (k = 7 chosen from the silhouette plot above)
#Rows are dive IDs:
#1) 2304_02
#2) 2304_05
#3) 2304_07
#4) 2304_08
#5 2306_01
#6 2306_03
#7 2306_04
#8 2306_05
#9 2306_06
#10 2306_07
#11 2306_08
#12 2306_12
#13 2306_14
#14 2306_15
#15 2306_18
Continuing with the environmental data. Which sites cluster together because they share similar environmental conditions? Does this correspond to geography? How does it relate to the species clusters?
# Environmental data clustering analysis
# Ensure NA rows are removed for environmental data
# Compute the distance matrix using Euclidean distance on the standardized variables
env.ch <-as.matrix(vegdist(na.omit(env2.z), "euc")) # Attach site names to object of class 'dist'/ Take out Dive 2306 08
# NOTE(review): site labels are hard-coded; this assumes na.omit() keeps the
# rows in the original dive order with only 2306_08 dropped -- TODO confirm
rownames(env.ch) <- c("2304_02","2304_05", "2304_07" , "2304_08", "2306_01" ,"2306_03", "2306_04", "2306_05", "2306_06" ,"2306_07", "2306_12", "2306_14" ,"2306_15", "2306_18") #w/o Dive 2306 8 as this one has missing env data
colnames(env.ch) <- c("2304_02","2304_05", "2304_07" , "2304_08", "2306_01" ,"2306_03", "2306_04", "2306_05", "2306_06" ,"2306_07", "2306_12", "2306_14" ,"2306_15", "2306_18")
env.ch <- as.dist(env.ch)
# Compute agglomerative clustering methods (same four methods as for the species data)
env.ch.single <- hclust(env.ch, method = "single")
env.ch.complete <- hclust(env.ch, method = "complete")
env.ch.UPGMA <- hclust(env.ch, method = "average") # UPGMA
env.ch.ward <- hclust(env.ch, method = "ward.D2")
# Plot dendrograms for different methods
par(mfrow = c(2, 2)) # Arrange plots in a grid
plot(env.ch.single, main = "Chord - Single linkage")
plot(env.ch.complete, main = "Chord - Complete linkage")
plot(env.ch.UPGMA, main = "Chord - UPGMA")
plot(env.ch.ward, main = "Chord - Ward")
par(mfrow = c(1, 1)) # Reset plot layout
# Compute cophenetic correlations to assess clustering method
env.ch.single.coph <- cophenetic(env.ch.single)
env.ch.comp.coph <- cophenetic(env.ch.complete)
env.ch.UPGMA.coph <- cophenetic(env.ch.UPGMA)
env.ch.ward.coph <- cophenetic(env.ch.ward)
# Print correlation values for each method
cor(env.ch, env.ch.single.coph) # Single Linkage
## [1] 0.676066
cor(env.ch, env.ch.comp.coph) # Complete Linkage
## [1] 0.7015525
cor(env.ch, env.ch.UPGMA.coph) # UPGMA (highest correlation)
## [1] 0.7392086
cor(env.ch, env.ch.ward.coph) # Ward's
## [1] 0.6750676
# Perform p-value bootstrapping to validate clusters (approximate unbiased p-values)
# ("eucl" partially matches dist()'s "euclidean" method)
envch.pv <- pvclust(t(env2.z), method.hclust = "average", method.dist = "eucl", parallel = TRUE)
## Creating a temporary cluster...done:
## socket cluster with 13 nodes on host 'localhost'
## Multiscale bootstrap... Done.
plot(envch.pv)
pvrect(envch.pv, alpha = 0.95, pv = "au") # Highlight significant clusters (AU p-value ≥ 0.95)
# Reorder the dendrogram based on environmental dissimilarities
env.chwo <- reorder.hclust(env.ch.UPGMA, env.ch)
plot(env.chwo, hang = 0, xlab = "3 groups", ylab = "Height", main = "Chord - UPGMA (Reordered)")
rect.hclust(env.chwo, k = 3) # Highlight 3 clusters
To further explore the clustering of species, and how they group by similarity or dissimilarity across the sites, I performed a PCA. I also ran a PCA on the environmental data to explore how environmental variables group together across the sites. This is followed by a Procrustes analysis to assess whether the species clustering and the environmental clustering are significantly correlated. If yes, proceed with multivariate analysis.
# PCA on Hellinger-transformed species data (SPE.hel).
# Dive EX2306_08 was dropped from env.z (no backscatter data), so the matching
# rows are removed from the species table as well before the ordination.
SPE.hel$site <- data.b$Dive.Name
SPE.hel <- subset(SPE.hel, site != "2306_08", select = -site)
# Morphotypes that became all-zero after the row removal would break
# prcomp(scale = TRUE), so only columns with at least one non-zero entry are kept.
spe.pca <- prcomp(SPE.hel[, colSums(SPE.hel != 0) > 0], scale = TRUE)
# PCA plot for species data
biplot(spe.pca, main = "PCA of Species Data") # adjust biplot options to tune axes/vectors if needed
#screeplot(spe.pca)
# PCA on Environmental Data (env.z standardized); restricting to numeric columns
# guards against any factor columns (site, water mass, feature) attached to env.z.
env.pca <- prcomp(env.z[, sapply(env.z, is.numeric)], scale = TRUE)
biplot(env.pca, main = "PCA of Environmental Data") # as for the species PCA, shows the first two components
#screeplot(env.pca)
Procrustes analysis tests whether the species composition (PCA on species data) and environmental variables (PCA on environmental data) are significantly correlated. If they are, it suggests that the species distributions are likely influenced by the environmental gradients in your dataset. A significant correlation means that the patterns in species composition are in some way “shaped” by the environmental factors, making CCA a suitable next step.
# 2. Perform Procrustes Analysis
# Test significance: protest() rotates the species ordination onto the
# environmental ordination and permutes to test the Procrustes correlation;
# scores = "sites" compares the site scores of the two PCAs
env.Procrustes.sign <- protest(env.pca, spe.pca, scores="sites") # p < 0.001
env.Procrustes.sign
##
## Call:
## protest(X = env.pca, Y = spe.pca, scores = "sites")
##
## Procrustes Sum of Squares (m12 squared): 0.9017
## Correlation in a symmetric Procrustes rotation: 0.3135
## Significance: 0.001
##
## Permutation: free
## Number of permutations: 999
Procrustes shows a moderate correlation between species and env data. R = 0.31. Significance (p < 0.001) indicates that the relationship between the environmental and species data is not random. The result suggests that while the correlation is moderate, the association is still meaningful and statistically reliable. Proceed with multivariate analysis (CCA)
Using CCA (Canonical Correspondence Analysis) as this assumes unimodal relationships instead of linear. In nature, this is probably a more realistic assumption.
# Using species data (Hellinger transformed) of the full wide dataset
# 1) Run a full model
# This includes all variables that are possible to include (excluding site, lat, lon, watermass). Here it is very important that all NA's are removed, and rows are the same! To double check, use nrow on both df's
# NOTE(review): 'scale' is an argument of rda(), not cca(); cca() most likely
# ignores it here -- confirm against the vegan documentation.
cca_full <- cca(SPE.hel ~ depth + slope + eastness + bs + rugosity + northness + sed_consistency, data = env.z, scale = TRUE) # as a reminder, dive 08 EX2306 is excluded as NA's in backscatter
# Marginal (Type III) permutation tests: each term tested with all others kept in the model
anova(cca_full, by = "margin") #test each variables significance for the model
## Permutation test for cca under reduced model
## Marginal effects of terms
## Permutation: free
## Number of permutations: 999
##
## Model: cca(formula = SPE.hel ~ depth + slope + eastness + bs + rugosity + northness + sed_consistency, data = env.z, scale = TRUE)
## Df ChiSquare F Pr(>F)
## depth 1 0.714 9.4782 0.001 ***
## slope 1 0.483 6.4055 0.001 ***
## eastness 1 0.415 5.5046 0.001 ***
## bs 1 0.300 3.9853 0.001 ***
## rugosity 1 0.536 7.1101 0.001 ***
## northness 1 0.291 3.8687 0.001 ***
## sed_consistency 1 0.335 4.4467 0.001 ***
## Residual 420 31.640
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Global permutation test of the whole constrained model
anova(cca_full) # The model is significant
## Permutation test for cca under reduced model
## Permutation: free
## Number of permutations: 999
##
## Model: cca(formula = SPE.hel ~ depth + slope + eastness + bs + rugosity + northness + sed_consistency, data = env.z, scale = TRUE)
## Df ChiSquare F Pr(>F)
## Model 7 3.46 6.5622 0.001 ***
## Residual 420 31.64
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Test the significance of individual canonical axes (forward tests, first to last)
anova(cca_full, by = "axis")
## Permutation test for cca under reduced model
## Forward tests for axes
## Permutation: free
## Number of permutations: 999
##
## Model: cca(formula = SPE.hel ~ depth + slope + eastness + bs + rugosity + northness + sed_consistency, data = env.z, scale = TRUE)
## Df ChiSquare F Pr(>F)
## CCA1 1 0.856 11.3696 0.001 ***
## CCA2 1 0.711 9.4418 0.001 ***
## CCA3 1 0.634 8.4169 0.001 ***
## CCA4 1 0.450 5.9716 0.001 ***
## CCA5 1 0.341 4.5271 0.001 ***
## CCA6 1 0.272 3.6115 0.001 ***
## CCA7 1 0.196 2.5969 1.000
## Residual 420 31.640
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Full numeric summary: inertia partitioning (constrained vs. unconstrained)
# plus eigenvalues and cumulative proportions for the CCA and CA axes
summary(cca_full)
##
## Call:
## cca(formula = SPE.hel ~ depth + slope + eastness + bs + rugosity + northness + sed_consistency, data = env.z, scale = TRUE)
##
## Partitioning of scaled Chi-square:
## Inertia Proportion
## Total 35.10 1.00000
## Constrained 3.46 0.09859
## Unconstrained 31.64 0.90141
##
## Eigenvalues, and their contribution to the scaled Chi-square
##
## Importance of components:
## CCA1 CCA2 CCA3 CCA4 CCA5 CCA6 CCA7
## Eigenvalue 0.8565 0.71127 0.63407 0.44986 0.341034 0.272060 0.195634
## Proportion Explained 0.0244 0.02026 0.01806 0.01282 0.009716 0.007751 0.005574
## Cumulative Proportion 0.0244 0.04467 0.06273 0.07555 0.085263 0.093014 0.098587
## CA1 CA2 CA3 CA4 CA5 CA6 CA7
## Eigenvalue 0.92820 0.89690 0.88058 0.87528 0.86187 0.85492 0.82628
## Proportion Explained 0.02644 0.02555 0.02509 0.02494 0.02455 0.02436 0.02354
## Cumulative Proportion 0.12503 0.15058 0.17567 0.20061 0.22516 0.24952 0.27306
## CA8 CA9 CA10 CA11 CA12 CA13 CA14
## Eigenvalue 0.80171 0.7793 0.73245 0.6878 0.65562 0.62515 0.61543
## Proportion Explained 0.02284 0.0222 0.02087 0.0196 0.01868 0.01781 0.01753
## Cumulative Proportion 0.29590 0.3181 0.33897 0.3586 0.37724 0.39506 0.41259
## CA15 CA16 CA17 CA18 CA19 CA20 CA21
## Eigenvalue 0.60524 0.55668 0.52228 0.49339 0.48206 0.47576 0.45136
## Proportion Explained 0.01724 0.01586 0.01488 0.01406 0.01373 0.01355 0.01286
## Cumulative Proportion 0.42983 0.44569 0.46057 0.47463 0.48836 0.50192 0.51478
## CA22 CA23 CA24 CA25 CA26 CA27 CA28
## Eigenvalue 0.4387 0.42864 0.42077 0.41271 0.39748 0.39641 0.37059
## Proportion Explained 0.0125 0.01221 0.01199 0.01176 0.01132 0.01129 0.01056
## Cumulative Proportion 0.5273 0.53949 0.55148 0.56323 0.57456 0.58585 0.59641
## CA29 CA30 CA31 CA32 CA33 CA34
## Eigenvalue 0.342691 0.332219 0.329492 0.322186 0.311197 0.302382
## Proportion Explained 0.009763 0.009465 0.009387 0.009179 0.008866 0.008615
## Cumulative Proportion 0.606173 0.615638 0.625025 0.634204 0.643070 0.651685
## CA35 CA36 CA37 CA38 CA39 CA40
## Eigenvalue 0.299892 0.287447 0.277909 0.274403 0.269177 0.259115
## Proportion Explained 0.008544 0.008189 0.007918 0.007818 0.007669 0.007382
## Cumulative Proportion 0.660229 0.668418 0.676336 0.684154 0.691822 0.699205
## CA41 CA42 CA43 CA44 CA45 CA46
## Eigenvalue 0.255704 0.246995 0.239271 0.235706 0.234590 0.23062
## Proportion Explained 0.007285 0.007037 0.006817 0.006715 0.006683 0.00657
## Cumulative Proportion 0.706490 0.713526 0.720343 0.727059 0.733742 0.74031
## CA47 CA48 CA49 CA50 CA51 CA52
## Eigenvalue 0.224925 0.221429 0.215082 0.211413 0.205507 0.204427
## Proportion Explained 0.006408 0.006309 0.006128 0.006023 0.005855 0.005824
## Cumulative Proportion 0.746720 0.753029 0.759157 0.765180 0.771035 0.776859
## CA53 CA54 CA55 CA56 CA57 CA58
## Eigenvalue 0.203490 0.195685 0.195099 0.189266 0.1860 0.185257
## Proportion Explained 0.005797 0.005575 0.005558 0.005392 0.0053 0.005278
## Cumulative Proportion 0.782656 0.788231 0.793790 0.799182 0.8045 0.809760
## CA59 CA60 CA61 CA62 CA63 CA64
## Eigenvalue 0.182492 0.178831 0.172381 0.167678 0.167216 0.16357
## Proportion Explained 0.005199 0.005095 0.004911 0.004777 0.004764 0.00466
## Cumulative Proportion 0.814959 0.820054 0.824965 0.829743 0.834507 0.83917
## CA65 CA66 CA67 CA68 CA69 CA70
## Eigenvalue 0.160487 0.159301 0.158040 0.155975 0.155376 0.149038
## Proportion Explained 0.004572 0.004538 0.004503 0.004444 0.004427 0.004246
## Cumulative Proportion 0.843739 0.848277 0.852780 0.857224 0.861650 0.865896
## CA71 CA72 CA73 CA74 CA75 CA76
## Eigenvalue 0.148498 0.144828 0.142084 0.141582 0.139678 0.137450
## Proportion Explained 0.004231 0.004126 0.004048 0.004034 0.003979 0.003916
## Cumulative Proportion 0.870127 0.874253 0.878301 0.882335 0.886314 0.890230
## CA77 CA78 CA79 CA80 CA81 CA82
## Eigenvalue 0.132265 0.12918 0.124200 0.121700 0.118337 0.115229
## Proportion Explained 0.003768 0.00368 0.003538 0.003467 0.003371 0.003283
## Cumulative Proportion 0.893999 0.89768 0.901217 0.904685 0.908056 0.911339
## CA83 CA84 CA85 CA86 CA87 CA88
## Eigenvalue 0.114571 0.108790 0.104867 0.098667 0.097470 0.094529
## Proportion Explained 0.003264 0.003099 0.002988 0.002811 0.002777 0.002693
## Cumulative Proportion 0.914603 0.917702 0.920690 0.923501 0.926278 0.928971
## CA89 CA90 CA91 CA92 CA93 CA94
## Eigenvalue 0.09230 0.089326 0.086456 0.082352 0.080620 0.079806
## Proportion Explained 0.00263 0.002545 0.002463 0.002346 0.002297 0.002274
## Cumulative Proportion 0.93160 0.934146 0.936609 0.938955 0.941252 0.943526
## CA95 CA96 CA97 CA98 CA99 CA100
## Eigenvalue 0.077474 0.076696 0.076132 0.074435 0.072928 0.072053
## Proportion Explained 0.002207 0.002185 0.002169 0.002121 0.002078 0.002053
## Cumulative Proportion 0.945733 0.947918 0.950087 0.952208 0.954285 0.956338
## CA101 CA102 CA103 CA104 CA105 CA106
## Eigenvalue 0.069339 0.067555 0.067260 0.06633 0.064734 0.06422
## Proportion Explained 0.001975 0.001925 0.001916 0.00189 0.001844 0.00183
## Cumulative Proportion 0.958314 0.960238 0.962154 0.96404 0.965888 0.96772
## CA107 CA108 CA109 CA110 CA111 CA112
## Eigenvalue 0.060512 0.058473 0.057243 0.053652 0.050705 0.049336
## Proportion Explained 0.001724 0.001666 0.001631 0.001529 0.001445 0.001406
## Cumulative Proportion 0.969442 0.971108 0.972739 0.974267 0.975712 0.977118
## CA113 CA114 CA115 CA116 CA117 CA118
## Eigenvalue 0.047588 0.046038 0.045392 0.044616 0.042942 0.039477
## Proportion Explained 0.001356 0.001312 0.001293 0.001271 0.001223 0.001125
## Cumulative Proportion 0.978473 0.979785 0.981078 0.982349 0.983573 0.984697
## CA119 CA120 CA121 CA122 CA123 CA124
## Eigenvalue 0.038809 0.037800 0.035187 0.0322444 0.0308331 0.0302128
## Proportion Explained 0.001106 0.001077 0.001002 0.0009186 0.0008784 0.0008608
## Cumulative Proportion 0.985803 0.986880 0.987882 0.9888011 0.9896796 0.9905403
## CA125 CA126 CA127 CA128 CA129 CA130
## Eigenvalue 0.0296901 0.026256 0.0252285 0.0244832 0.02317 0.021869
## Proportion Explained 0.0008459 0.000748 0.0007188 0.0006975 0.00066 0.000623
## Cumulative Proportion 0.9913862 0.992134 0.9928530 0.9935505 0.99421 0.994834
## CA131 CA132 CA133 CA134 CA135
## Eigenvalue 0.0207268 0.0189054 0.0181229 0.0174626 0.0166323
## Proportion Explained 0.0005905 0.0005386 0.0005163 0.0004975 0.0004739
## Cumulative Proportion 0.9954240 0.9959627 0.9964790 0.9969765 0.9974503
## CA136 CA137 CA138 CA139 CA140
## Eigenvalue 0.0147257 0.0138336 0.0127439 0.0120723 0.0103017
## Proportion Explained 0.0004195 0.0003941 0.0003631 0.0003439 0.0002935
## Cumulative Proportion 0.9978699 0.9982640 0.9986271 0.9989710 0.9992645
## CA141 CA142 CA143 CA144 CA145
## Eigenvalue 0.009055 0.0066055 0.0048864 0.0039509 1.318e-03
## Proportion Explained 0.000258 0.0001882 0.0001392 0.0001126 3.754e-05
## Cumulative Proportion 0.999522 0.9997107 0.9998499 0.9999625 1.000e+00
##
## Accumulated constrained eigenvalues
## Importance of components:
## CCA1 CCA2 CCA3 CCA4 CCA5 CCA6 CCA7
## Eigenvalue 0.8565 0.7113 0.6341 0.4499 0.34103 0.27206 0.19563
## Proportion Explained 0.2475 0.2055 0.1832 0.1300 0.09855 0.07862 0.05653
## Cumulative Proportion 0.2475 0.4531 0.6363 0.7663 0.86484 0.94347 1.00000
# Proportion of the total inertia explained by the constrained (canonical)
# part of a vegan ordination model.
#
# cca_model: a fitted vegan::cca() (or rda()) result. The constrained inertia
#   is computed as total inertia minus the unconstrained residual inertia
#   (cca_model$CA$tot.chi), which stays correct for partial models where a
#   Condition() term absorbs part of the variance.
# Returns: a single numeric proportion in [0, 1].
calculate_proportion_explained <- function(cca_model) {
  total_variation <- cca_model$tot.chi
  constrained_variation <- total_variation - cca_model$CA$tot.chi
  constrained_variation / total_variation
}
# Constrained / total inertia of the full model
calculate_proportion_explained(cca_full) # How much does the full model explain? About 9.86% of the total variation in species composition. This sounds low, but in ecological, marine studies not uncommon. It just means a lot more is shaping the communities, = the unconstrained variation (e.g. species interactions, dispersal strategies etc.)
## [1] 0.09858745
# Check for collinearity of variables - Variance Inflation Factors
vif.cca(cca_full)
## depth slope eastness bs rugosity
## 1.461075 1.183490 1.561514 1.280600 1.617593
## northness sed_consistency
## 1.322708 1.302115
#VIF < 5: No serious collinearity; VIF 5–10: Moderate collinearity, consider reducing variables; VIF > 10: High collinearity, remove or combine correlated predictors - So all the environmental variables are good to use.
# Run a second full model that adds watermass to the predictors (just to test its influence).
# Site and feature are attached to env.z as factors here too, but only watermass
# enters the model formula below.
# First add the watermass as a factor to env.z
data.b <- data.b[data.b$Dive.Name != "2306_08", ] # first need to also delete Dive 08 EX2306 (note: this mutates data.b for all later chunks)
env.z$site <- as.factor(data.b$Dive.Name) # Add the site information
env.z$watermass <- as.factor(data.b$watermass) # Add the water mass information
env.z$feature <- as.factor(data.b$feature) # Add the feature information
cca_full2 <- cca(SPE.hel ~ slope + eastness + bs + rugosity + sed_consistency + northness + depth + watermass, data = env.z, scale = TRUE)
# Variance inflation factors for the extended model (watermass expands into dummy variables)
vif.cca(cca_full2)
## slope eastness bs rugosity
## 2.128915 3.236612 1.705460 10.504427
## sed_consistency northness depth watermassANSC
## 1.623899 1.522587 19.069032 8.398055
## watermassPDW watermassPDW/LCDW watermassPSIW
## 3.760601 17.006028 7.521306
# NOTE: Watermass is categorical, so it appears as individual (dummy) variables here.
# Depth and watermass are highly correlated (VIF > 10 above) - which is to be expected;
# depth will not be used together with watermass in the final model anyway.
# But we can check if it adds additional information to the model:
# Sequential (Type I) tests: each term is assessed after the terms listed before it
anova(cca_full2, by = "terms")
## Permutation test for cca under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## Model: cca(formula = SPE.hel ~ slope + eastness + bs + rugosity + sed_consistency + northness + depth + watermass, data = env.z, scale = TRUE)
## Df ChiSquare F Pr(>F)
## slope 1 0.5330 7.5354 0.001 ***
## eastness 1 0.5233 7.3994 0.001 ***
## bs 1 0.4219 5.9648 0.001 ***
## rugosity 1 0.5566 7.8692 0.001 ***
## sed_consistency 1 0.4051 5.7279 0.001 ***
## northness 1 0.3065 4.3341 0.001 ***
## depth 1 0.7140 10.0953 0.001 ***
## watermass 4 2.2169 7.8360 0.001 ***
## Residual 416 29.4227
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Canonical (regression) coefficients of the extended model
coef(cca_full2) # coefficients show how much each environmental variable contributes to each canonical axis (CCA1, CCA2, etc.), which represent gradients in species composition. Larger absolute values indicate stronger relationships between the variable and the axis. Positive/negative values show the direction of the relationship.
## CCA1 CCA2 CCA3 CCA4 CCA5
## slope 0.003046283 0.07619436 -0.02444276 0.20700077 -0.45376598
## eastness -0.034003582 0.01843548 -0.02245020 0.03469815 -0.15048615
## bs 0.003579928 0.02156912 -0.01572393 0.05604582 -0.08394914
## rugosity -0.004304628 -0.15690426 0.20063940 -0.16046266 0.52476327
## sed_consistency -0.013532912 -0.05553178 0.10277548 -0.01944144 -0.31483791
## northness 0.018734916 -0.02479359 0.03650233 0.06076624 0.22421100
## depth -0.040238619 0.92673831 -1.38652077 -0.91582236 1.15732423
## watermassANSC 2.427885399 1.42073726 -1.23473887 -3.21901654 2.91413419
## watermassPDW -0.285408697 3.65497172 2.18743869 -0.89810151 0.24238639
## watermassPDW/LCDW -0.100495161 -0.96906124 0.84886983 -1.06846075 -2.39744927
## watermassPSIW 0.051714043 0.79122733 -0.28004008 -3.64077285 2.02298695
## CCA6 CCA7 CCA8 CCA9 CCA10
## slope -0.823454227 -0.6521524 0.63005192 -0.2569045 0.2453663
## eastness 0.008120634 0.7333346 0.27019300 1.0848696 -0.1628281
## bs -0.291936700 0.3461386 0.02578135 0.3410311 0.3320855
## rugosity 1.383543416 0.3993149 0.59968508 1.4918996 -1.4746398
## sed_consistency 0.213184176 0.7214802 0.61120849 -0.8641202 0.3237361
## northness 0.205112897 -0.3864530 0.40446483 0.1381029 0.9566287
## depth -2.022439192 -0.8825660 0.05618539 -1.5459913 1.5089164
## watermassANSC 1.192352836 -0.7814902 -0.01502720 2.0775615 -1.4121719
## watermassPDW 2.574846966 -2.4408662 -0.04174918 1.1587748 -1.5393543
## watermassPDW/LCDW 5.023612682 0.3708254 0.21581445 3.0727194 -4.1527051
## watermassPSIW -1.202364424 -2.6740830 -0.37907830 -0.6328968 -0.4726168
## CCA11
## slope -0.3578369
## eastness -1.1076067
## bs 1.0414786
## rugosity 1.5662564
## sed_consistency -0.1561345
## northness 0.3520567
## depth -1.8398159
## watermassANSC 4.0886334
## watermassPDW 5.8650142
## watermassPDW/LCDW 7.4747254
## watermassPSIW 2.6916542
calculate_proportion_explained(cca_full2) # The second full model, which additionally includes water mass (site is attached to env.z but not in the formula), explains much more (16%) - which is to be expected.
## [1] 0.1617468
Summary so far from the two full models: Depth and water masses are the dominant drivers of community structure. Rugosity and slope also play key roles, but to a lesser extent. Each water-mass level affects species composition differently, which is why the VIF output listed them separately. Next I will fit conditioned (partial) models to assess each variable's contribution separately.
# All variables to check for a conditioned (partial) model. For fun also including site here
env_vars <- c("depth", "site", "slope", "eastness", "rugosity", "bs", "sed_consistency", "feature", "watermass")
# Create a list to store CCA results
cca_results <- list()
# Run CCA for each variable as a condition: all other variables are constraints,
# while the focal variable's effect is partialled out via Condition()
# (note: the local name 'formula' shadows base::formula inside this loop -- harmless here)
for (var in env_vars) {
formula <- as.formula(paste("SPE.hel ~", paste(setdiff(env_vars, var), collapse = " + "),
"+ Condition(", var, ")"))
cca_results[[var]] <- cca(formula, data = env.z, scale = TRUE)
}
##
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'feature2', 'feature3',
## 'feature4', 'feature5', 'watermassANSC', 'watermassPDW', 'watermassPDW/LCDW',
## 'watermassPSIW'
##
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'feature2', 'feature3',
## 'feature4', 'feature5', 'watermassANSC', 'watermassPDW', 'watermassPDW/LCDW',
## 'watermassPSIW'
##
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'feature2', 'feature3',
## 'feature4', 'feature5', 'watermassANSC', 'watermassPDW', 'watermassPDW/LCDW',
## 'watermassPSIW'
##
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'feature2', 'feature3',
## 'feature4', 'feature5', 'watermassANSC', 'watermassPDW', 'watermassPDW/LCDW',
## 'watermassPSIW'
##
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'feature2', 'feature3',
## 'feature4', 'feature5', 'watermassANSC', 'watermassPDW', 'watermassPDW/LCDW',
## 'watermassPSIW'
##
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'feature2', 'feature3',
## 'feature4', 'feature5', 'watermassANSC', 'watermassPDW', 'watermassPDW/LCDW',
## 'watermassPSIW'
##
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'feature2', 'feature3',
## 'feature4', 'feature5', 'watermassANSC', 'watermassPDW', 'watermassPDW/LCDW',
## 'watermassPSIW'
##
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'site2306_01',
## 'site2306_07', 'site2306_14', 'site2306_18', 'watermassANSC', 'watermassPDW',
## 'watermassPDW/LCDW', 'watermassPSIW'
##
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'site2304_08',
## 'site2306_04', 'site2306_06', 'site2306_18', 'feature2', 'feature3',
## 'feature4', 'feature5'
# Now extract and print the variance explained (inertia) for each model
for (var in env_vars) {
inertia <- cca_results[[var]]$CCA$tot.chi # NOTE(review): $CCA$tot.chi is the *constrained* inertia of the partial model, not the total inertia (that would be $tot.chi) - the original comment and the variable name are misleading
constrained_inertia <- cca_results[[var]]$CCA$eig[1] # Inertia on the first constrained axis only
variation_explained <- (constrained_inertia / inertia) * 100 # i.e. the first axis' share of the constrained variance
cat("Variation explained by", var, ":", variation_explained, "%\n") # Percent (explained) inertia
}
## Variation explained by depth : 11.91178 %
## Variation explained by site : 30.31318 %
## Variation explained by slope : 11.44967 %
## Variation explained by eastness : 11.43797 %
## Variation explained by rugosity : 11.57756 %
## Variation explained by bs : 11.32152 %
## Variation explained by sed_consistency : 11.24673 %
## Variation explained by feature : 14.20353 %
## Variation explained by watermass : 16.49963 %
A bit of explanation to the above: the variation explained differs from the earlier ANOVA results because the conditioned CCA approach looks at the total variation explained by all predictors together and at how each variable explains variation in the context of all the others. When conditioning on one variable, the variation it explains can appear smaller because part of it is shared with, or redistributed among, the other variables. The ANOVA approach instead tests the marginal significance of each variable, so depth can show strong significance and a low p-value because it independently explains a lot of variation in the species data. In short: the conditioned models report each variable's unique (partial) contribution, while the ANOVA reports marginal significance.
Now we will run the model we want to plot:
# Remove all variables that are not added based on the results above. I do not include watermass, feature, site (because that doesn't make sense to include if I want to see the clustering, should not bias the ordination, was just used to test the effect. And watermass and feature are categorical, so each mass has a different influence). Also bs and eastness, northness because they have a very weak influence (see the coef results from above, first two CCA axes. <0.04 (or -0.04) in at least one of the axes. The other variables are above that, so I decided to only take those. When I run the model and plotted it including the above variables, they are very close to the center too, as expected.
# Since this is the case, we could include Dive EX2306 08 again, since we don't end up using backscatter. I decided not to, as it doesn't change the outcome a lot, this dive is just missing from the CCA. I run this separately just to confirm, and the clustering and statistics are similar. Dive 08 clusters also with their respective water mass - PDW - and is close to the other PDW site, Quinn Seamount (EX2306 04), but also close to the other seamounts, as Quinn too. PDW and PSIW are a bit overlapping in both models.
# Fit the reduced model that will be plotted
cca_model <- cca(SPE.hel ~ slope + rugosity + sed_consistency + depth, data = env.z, scale = TRUE)
calculate_proportion_explained(cca_model)
# Pull out the score sets needed for the biplot: sites, constraint centroids,
# biplot arrows and species (morphotype) loadings
scores.cca <- vegan::scores(cca_model, display = c("sites", "cn", "bp", "sp"))
cca.sites <- as.data.frame(scores.cca$sites)
cca.biplot <- as.data.frame(scores.cca$biplot)
cca.species <- as.data.frame(scores.cca$species) # all morphotypes
# Drop morphotypes whose loading is negligible on both of the first two axes
keep_morphotype <- abs(cca.species$CCA1) > 0.05 | abs(cca.species$CCA2) > 0.05
cca.species <- cca.species[keep_morphotype, ]
# Attach site-level metadata used for colouring/shaping the plot
cca.sites$site <- data.b$Dive.Name
cca.sites$feature <- as.factor(data.b$feature)
cca.sites$watermass <- as.factor(data.b$watermass)
cca.sites$sed_consistency <- as.factor(data.b$sed_consistency)
# Promote the row names to a proper column so the merge below can key on it
cca.species$Morphotype <- rownames(cca.species)
# Join the taxon metadata onto the morphotype scores
cca_full_df <- merge(cca.species, taxa, by = "Morphotype")
# Keep only morphotypes significantly associated with the ordination
# (envfit with 999 permutations, p < 0.05)
species_fit <- envfit(cca_model, SPE.hel, perm = 999)
significant_species <- species_fit$vectors$pvals < 0.05
significant_species_names <- rownames(species_fit$vectors$arrows)[significant_species]
cca_full_df <- cca_full_df[cca_full_df$Morphotype %in% significant_species_names, ]
# Plotting the biplot. I am using water mass as shape, since they were strongly significant. And also explained the highest variation in the model.
# Biplot of the reduced CCA: one point per sample, coloured by site and shaped by water mass
plot.cca <- ggplot(cca.sites, aes(x = CCA1, y = CCA2)) +
geom_point(aes(x = CCA1, y = CCA2,
col = site, shape = watermass, alpha = 0.9),
size = 3) +
# NOTE(review): alpha = 0.9 is mapped inside aes() (a constant mapping) rather than
# set outside it; the scale below suppresses the resulting legend entry. Setting
# alpha outside aes() would be the conventional form - confirm intent before changing.
scale_alpha(range = c(0, 1), guide = "none") + # Hide alpha from the legend
geom_hline(yintercept=0, linetype="dotted") +
geom_vline(xintercept=0, linetype="dotted") +
# Constraint arrows stretched x2 (labels at x2.5) purely for legibility
geom_segment(data = cca.biplot,
aes(x = 0, xend = CCA1*2, y = 0, yend = CCA2*2),
arrow=arrow(length=unit(0.01,"npc")),
color = "#C20000") +
geom_text(data = cca.biplot,
aes(x = CCA1*2.5, y = CCA2*2.5, label=rownames(cca.biplot)),
size = 4,
color = "#C20000") +
# 95% confidence ellipses drawn per water mass
stat_ellipse(aes(fill=watermass), level = 0.95, alpha=0.3, linetype = "dashed") +
# Morphotype labels. NOTE(review): position_jitter() has no seed here, so label
# placement changes on every render - consider passing seed = for reproducibility.
geom_text(data = cca_full_df, aes(x = CCA1, y = CCA2, label = Morphotype),
position = position_jitter(width = 0.5, height = 0.5), size = 3, color = "black") + # Jitter text
labs(col = "Site", shape = "Water Mass or Current",
x = "CCA1",
y = "CCA2",
size = 5) +
theme_linedraw() +
theme(panel.grid = element_blank(),
legend.title = element_text(size = 10),
legend.text = element_text(size = 10),
axis.title.x = element_text (size = 14),
axis.title.y = element_text(size = 14),
legend.background = element_blank(),
legend.box.background= element_rect(colour="black"),
legend.position = "right")
plot.cca
Since the morphotype assignments might also influence the analysis (for example, sponges are more often identified only to genus or family, while corals tend to be better resolved), let's repeat the same analysis on the family-level data.
# First test the full model as above
# 1) Run a full model
# This includes all variables that are possible to include (excluding site, lat, lon, watermass). Here it is very important that all NA's are removed, and rows are the same! To double check, use nrow on both df's
cca_full_family <- cca(SPE.hel3 ~ depth + slope + eastness + bs + rugosity + northness + sed_consistency, data = env.z, scale = TRUE)
# FIX: test the family-level model just fitted. The original called anova(cca_full),
# which is the morphotype-level model from the previous section (copy-paste slip;
# the printed output even shows the SPE.hel formula).
anova(cca_full_family, by = "margin") # test each variable's marginal significance for the model
## Permutation test for cca under reduced model
## Marginal effects of terms
## Permutation: free
## Number of permutations: 999
##
## Model: cca(formula = SPE.hel ~ depth + slope + eastness + bs + rugosity + northness + sed_consistency, data = env.z, scale = TRUE)
## Df ChiSquare F Pr(>F)
## depth 1 0.714 9.4782 0.001 ***
## slope 1 0.483 6.4055 0.001 ***
## eastness 1 0.415 5.5046 0.001 ***
## bs 1 0.300 3.9853 0.001 ***
## rugosity 1 0.536 7.1101 0.001 ***
## northness 1 0.291 3.8687 0.001 ***
## sed_consistency 1 0.335 4.4467 0.001 ***
## Residual 420 31.640
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# FIX: was anova(cca_full) - the morphotype-level model from the previous section;
# this section tests the family-level model fitted above.
anova(cca_full_family) # global permutation test of the whole constrained model
## Permutation test for cca under reduced model
## Permutation: free
## Number of permutations: 999
##
## Model: cca(formula = SPE.hel ~ depth + slope + eastness + bs + rugosity + northness + sed_consistency, data = env.z, scale = TRUE)
## Df ChiSquare F Pr(>F)
## Model 7 3.46 6.5622 0.001 ***
## Residual 420 31.64
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Test the significance of individual canonical axes
anova(cca_full_family, by = "axis") # forward tests: each axis is tested given the preceding ones; per the output below, CCA1-CCA6 are significant but CCA7 is not
## Permutation test for cca under reduced model
## Forward tests for axes
## Permutation: free
## Number of permutations: 999
##
## Model: cca(formula = SPE.hel3 ~ depth + slope + eastness + bs + rugosity + northness + sed_consistency, data = env.z, scale = TRUE)
## Df ChiSquare F Pr(>F)
## CCA1 1 0.5048 28.8098 0.001 ***
## CCA2 1 0.3297 18.8177 0.001 ***
## CCA3 1 0.2477 14.1373 0.001 ***
## CCA4 1 0.1575 8.9872 0.001 ***
## CCA5 1 0.1023 5.8356 0.001 ***
## CCA6 1 0.0621 3.5426 0.001 ***
## CCA7 1 0.0395 2.2534 1.000
## Residual 420 7.3598
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(cca_full_family) # inertia partitioning (constrained vs unconstrained) plus eigenvalues and cumulative proportions for the CCA and CA axes
##
## Call:
## cca(formula = SPE.hel3 ~ depth + slope + eastness + bs + rugosity + northness + sed_consistency, data = env.z, scale = TRUE)
##
## Partitioning of scaled Chi-square:
## Inertia Proportion
## Total 8.803 1.000
## Constrained 1.444 0.164
## Unconstrained 7.360 0.836
##
## Eigenvalues, and their contribution to the scaled Chi-square
##
## Importance of components:
## CCA1 CCA2 CCA3 CCA4 CCA5 CCA6 CCA7
## Eigenvalue 0.50484 0.32975 0.24773 0.15749 0.10226 0.062079 0.039486
## Proportion Explained 0.05735 0.03746 0.02814 0.01789 0.01162 0.007052 0.004485
## Cumulative Proportion 0.05735 0.09480 0.12294 0.14083 0.15245 0.159500 0.163985
## CA1 CA2 CA3 CA4 CA5 CA6 CA7
## Eigenvalue 0.71183 0.64385 0.61809 0.53605 0.46412 0.40354 0.3962
## Proportion Explained 0.08086 0.07314 0.07021 0.06089 0.05272 0.04584 0.0450
## Cumulative Proportion 0.24484 0.31798 0.38819 0.44908 0.50180 0.54764 0.5926
## CA8 CA9 CA10 CA11 CA12 CA13 CA14
## Eigenvalue 0.38396 0.3416 0.29900 0.28118 0.26221 0.23971 0.2227
## Proportion Explained 0.04361 0.0388 0.03396 0.03194 0.02978 0.02723 0.0253
## Cumulative Proportion 0.63626 0.6751 0.70902 0.74096 0.77075 0.79798 0.8233
## CA15 CA16 CA17 CA18 CA19 CA20 CA21
## Eigenvalue 0.19818 0.18780 0.16911 0.1514 0.14725 0.13287 0.12051
## Proportion Explained 0.02251 0.02133 0.01921 0.0172 0.01673 0.01509 0.01369
## Cumulative Proportion 0.84579 0.86712 0.88633 0.9035 0.92025 0.93535 0.94903
## CA22 CA23 CA24 CA25 CA26 CA27
## Eigenvalue 0.1118 0.09387 0.083024 0.062408 0.04305 0.032731
## Proportion Explained 0.0127 0.01066 0.009431 0.007089 0.00489 0.003718
## Cumulative Proportion 0.9617 0.97239 0.981823 0.988912 0.99380 0.997521
## CA28
## Eigenvalue 0.021826
## Proportion Explained 0.002479
## Cumulative Proportion 1.000000
##
## Accumulated constrained eigenvalues
## Importance of components:
## CCA1 CCA2 CCA3 CCA4 CCA5 CCA6 CCA7
## Eigenvalue 0.5048 0.3297 0.2477 0.1575 0.10226 0.06208 0.03949
## Proportion Explained 0.3497 0.2284 0.1716 0.1091 0.07083 0.04300 0.02735
## Cumulative Proportion 0.3497 0.5781 0.7497 0.8588 0.92965 0.97265 1.00000
calculate_proportion_explained(cca_full_family) # share of total inertia captured by the constraints (~16.4%); helper defined earlier in this document
## [1] 0.1639855
vif.cca(cca_full_family) # checking again the collinearity, but should be same as before; all VIFs < 2 here, so no problematic collinearity among the retained variables
## depth slope eastness bs rugosity
## 1.440203 1.184746 1.570254 1.302130 1.616738
## northness sed_consistency
## 1.328061 1.306054
# Run a second full model, also including site and watermass (just to test their influence on the test)
# First add the watermass as a factor to env.z
env.z$site <- as.factor(data.b$Dive.Name) # Add the site (dive) information
env.z$watermass <- as.factor(data.b$watermass) # Add the water-mass information
env.z$feature <- as.factor(data.b$feature) # Add the feature information
cca_full2_family <- cca(SPE.hel3 ~ slope + eastness + bs + rugosity + sed_consistency + northness + depth + watermass, data = env.z, scale = TRUE)
anova(cca_full2_family, by = "terms") # sequential tests: terms are added first to last, so the order of the formula matters
## Permutation test for cca under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## Model: cca(formula = SPE.hel3 ~ slope + eastness + bs + rugosity + sed_consistency + northness + depth + watermass, data = env.z, scale = TRUE)
## Df ChiSquare F Pr(>F)
## slope 1 0.2415 15.0588 0.001 ***
## eastness 1 0.2510 15.6515 0.001 ***
## bs 1 0.1410 8.7924 0.001 ***
## rugosity 1 0.1707 10.6436 0.001 ***
## sed_consistency 1 0.2428 15.1401 0.001 ***
## northness 1 0.0969 6.0442 0.001 ***
## depth 1 0.2998 18.6959 0.001 ***
## watermass 4 0.6890 10.7412 0.001 ***
## Residual 416 6.6708
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# FIX: was coef(cca_full2) - the morphotype-level model from the previous section;
# this section analyses the family-level model cca_full2_family fitted just above.
# Coefficients show how much each environmental variable contributes to each canonical
# axis (CCA1, CCA2, ...), which represent gradients in species composition. Larger
# absolute values indicate stronger relationships; the sign gives the direction.
coef(cca_full2_family)
## CCA1 CCA2 CCA3 CCA4 CCA5
## slope 0.003046283 0.07619436 -0.02444276 0.20700077 -0.45376598
## eastness -0.034003582 0.01843548 -0.02245020 0.03469815 -0.15048615
## bs 0.003579928 0.02156912 -0.01572393 0.05604582 -0.08394914
## rugosity -0.004304628 -0.15690426 0.20063940 -0.16046266 0.52476327
## sed_consistency -0.013532912 -0.05553178 0.10277548 -0.01944144 -0.31483791
## northness 0.018734916 -0.02479359 0.03650233 0.06076624 0.22421100
## depth -0.040238619 0.92673831 -1.38652077 -0.91582236 1.15732423
## watermassANSC 2.427885399 1.42073726 -1.23473887 -3.21901654 2.91413419
## watermassPDW -0.285408697 3.65497172 2.18743869 -0.89810151 0.24238639
## watermassPDW/LCDW -0.100495161 -0.96906124 0.84886983 -1.06846075 -2.39744927
## watermassPSIW 0.051714043 0.79122733 -0.28004008 -3.64077285 2.02298695
## CCA6 CCA7 CCA8 CCA9 CCA10
## slope -0.823454227 -0.6521524 0.63005192 -0.2569045 0.2453663
## eastness 0.008120634 0.7333346 0.27019300 1.0848696 -0.1628281
## bs -0.291936700 0.3461386 0.02578135 0.3410311 0.3320855
## rugosity 1.383543416 0.3993149 0.59968508 1.4918996 -1.4746398
## sed_consistency 0.213184176 0.7214802 0.61120849 -0.8641202 0.3237361
## northness 0.205112897 -0.3864530 0.40446483 0.1381029 0.9566287
## depth -2.022439192 -0.8825660 0.05618539 -1.5459913 1.5089164
## watermassANSC 1.192352836 -0.7814902 -0.01502720 2.0775615 -1.4121719
## watermassPDW 2.574846966 -2.4408662 -0.04174918 1.1587748 -1.5393543
## watermassPDW/LCDW 5.023612682 0.3708254 0.21581445 3.0727194 -4.1527051
## watermassPSIW -1.202364424 -2.6740830 -0.37907830 -0.6328968 -0.4726168
## CCA11
## slope -0.3578369
## eastness -1.1076067
## bs 1.0414786
## rugosity 1.5662564
## sed_consistency -0.1561345
## northness 0.3520567
## depth -1.8398159
## watermassANSC 4.0886334
## watermassPDW 5.8650142
## watermassPDW/LCDW 7.4747254
## watermassPSIW 2.6916542
# See how much variation each variable explains
# Refit the partial CCAs on the family-level matrix: each variable in turn is
# partialled out via Condition(), with all the other variables as constraints.
cca_results <- list()
for (focal in env_vars) {
  rhs <- paste(setdiff(env_vars, focal), collapse = " + ")
  cca_results[[focal]] <- cca(
    as.formula(paste0("SPE.hel3 ~ ", rhs, " + Condition(", focal, ")")),
    data = env.z, scale = TRUE
  )
}
##
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'feature2', 'feature3',
## 'feature4', 'feature5', 'watermassANSC', 'watermassPDW', 'watermassPDW/LCDW',
## 'watermassPSIW'
##
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'feature2', 'feature3',
## 'feature4', 'feature5', 'watermassANSC', 'watermassPDW', 'watermassPDW/LCDW',
## 'watermassPSIW'
##
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'feature2', 'feature3',
## 'feature4', 'feature5', 'watermassANSC', 'watermassPDW', 'watermassPDW/LCDW',
## 'watermassPSIW'
##
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'feature2', 'feature3',
## 'feature4', 'feature5', 'watermassANSC', 'watermassPDW', 'watermassPDW/LCDW',
## 'watermassPSIW'
##
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'feature2', 'feature3',
## 'feature4', 'feature5', 'watermassANSC', 'watermassPDW', 'watermassPDW/LCDW',
## 'watermassPSIW'
##
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'feature2', 'feature3',
## 'feature4', 'feature5', 'watermassANSC', 'watermassPDW', 'watermassPDW/LCDW',
## 'watermassPSIW'
##
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'feature2', 'feature3',
## 'feature4', 'feature5', 'watermassANSC', 'watermassPDW', 'watermassPDW/LCDW',
## 'watermassPSIW'
##
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'site2306_01',
## 'site2306_07', 'site2306_14', 'site2306_18', 'watermassANSC', 'watermassPDW',
## 'watermassPDW/LCDW', 'watermassPSIW'
##
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'site2304_08',
## 'site2306_04', 'site2306_06', 'site2306_18', 'feature2', 'feature3',
## 'feature4', 'feature5'
# Now extract and print the variance explained (inertia) for each model
for (var in env_vars) {
inertia <- cca_results[[var]]$CCA$tot.chi # NOTE(review): $CCA$tot.chi is the *constrained* inertia of the partial model, not the total inertia (that would be $tot.chi) - the original comment and the variable name are misleading
constrained_inertia <- cca_results[[var]]$CCA$eig[1] # Inertia on the first constrained axis only
variation_explained <- (constrained_inertia / inertia) * 100 # i.e. the first axis' share of the constrained variance
cat("Variation explained by", var, ":", variation_explained, "%\n") # Percent (explained) inertia
}
## Variation explained by depth : 18.99515 %
## Variation explained by site : 32.54604 %
## Variation explained by slope : 20.9424 %
## Variation explained by eastness : 18.50835 %
## Variation explained by rugosity : 18.82024 %
## Variation explained by bs : 20.3314 %
## Variation explained by sed_consistency : 15.76682 %
## Variation explained by feature : 20.29565 %
## Variation explained by watermass : 20.68799 %
# Reduced model for the biplot
cca_model_family <- cca(SPE.hel3 ~ slope + rugosity + sed_consistency + depth, data = env.z, scale = TRUE)
calculate_proportion_explained(cca_model_family)
# Pull out the score sets needed for the biplot: sites, constraint centroids,
# biplot arrows and family loadings
scores.cca <- vegan::scores(cca_model_family, display = c("sites", "cn", "bp", "sp"))
cca.sites <- as.data.frame(scores.cca$sites)
cca.biplot <- as.data.frame(scores.cca$biplot)
cca.species <- as.data.frame(scores.cca$species) # all families
# Drop families whose loading is negligible on both of the first two axes
keep_family <- abs(cca.species$CCA1) > 0.05 | abs(cca.species$CCA2) > 0.05
cca.species <- cca.species[keep_family, ]
# Attach site-level metadata used for colouring/shaping the plot
cca.sites$site <- data.b$Dive.Name
cca.sites$feature <- as.factor(data.b$feature)
cca.sites$watermass <- as.factor(data.b$watermass)
cca.sites$sed_consistency <- as.factor(data.b$sed_consistency)
# Promote the row names to a proper column so the merge below can key on it
cca.species$Family <- rownames(cca.species)
# Join the taxon metadata onto the family scores
cca_full_df <- merge(cca.species, taxa, by = "Family")
# Keep only families significantly associated with the ordination
# (envfit with 999 permutations; note the stricter p < 0.01 cutoff here)
species_fit <- envfit(cca_model_family, SPE.hel3, perm = 999)
significant_species <- species_fit$vectors$pvals < 0.01
significant_species_names <- rownames(species_fit$vectors$arrows)[significant_species]
cca_full_df <- cca_full_df[cca_full_df$Family %in% significant_species_names, ]
# Plotting the biplot. I am using water mass as shape, since they were strongly significant. And also explained the highest variation in the model.
# Family-level biplot: one point per sample, coloured by site and shaped by water mass
plot.cca_family <- ggplot(cca.sites, aes(x = CCA1, y = CCA2)) +
geom_point(aes(x = CCA1, y = CCA2,
col = site, shape = watermass, alpha = 0.9),
size = 3) +
# NOTE(review): alpha = 0.9 is mapped inside aes() (a constant mapping) rather than
# set outside it; the scale below suppresses the resulting legend entry.
scale_alpha(range = c(0, 1), guide = "none") + # Hide alpha from the legend
geom_hline(yintercept=0, linetype="dotted") +
geom_vline(xintercept=0, linetype="dotted") +
# Constraint arrows stretched x2 (labels at x2.5) purely for legibility
geom_segment(data = cca.biplot,
aes(x = 0, xend = CCA1*2, y = 0, yend = CCA2*2),
arrow=arrow(length=unit(0.01,"npc")),
color = "#C20000") +
geom_text(data = cca.biplot,
aes(x = CCA1*2.5, y = CCA2*2.5, label=rownames(cca.biplot)),
size = 4,
color = "#C20000") +
# Family labels. NOTE(review): position_jitter() has no seed here, so label
# placement changes on every render - consider passing seed = for reproducibility.
geom_text(data = cca_full_df, aes(x = CCA1, y = CCA2, label = Family),
position = position_jitter(width = 0.5, height = 0.5), size = 3, color = "black") + # Jitter text
labs(col = "Site", shape = "Water Mass or Current",
x = "CCA1",
y = "CCA2",
size = 5) +
theme_linedraw() +
theme(panel.grid = element_blank(),
legend.title = element_text(size = 10),
legend.text = element_text(size = 10),
axis.title.x = element_text (size = 14),
axis.title.y = element_text(size = 14),
legend.background = element_blank(),
legend.box.background= element_rect(colour="black"),
legend.position = "right")
plot.cca_family
# This needs to be run in the console because there is a prompt asking to download the underlying (map) data, so it is not included in the knitted Markdown file
library(ggOceanMaps)
#packageVersion("ggOceanMaps")
# Aggregate to one row per dive: mean coordinates plus the (single) feature class
data.map <- data.c %>%
  group_by(Dive.Name) %>%
  dplyr::summarize(
    lat = mean(lat), # Or use a different method to get unique coordinates
    lon = mean(lon),
    feature = first(feature) # Keep the feature or other relevant data
  )
# NOTE(review): this helper is never called anywhere below - the commented-out
# basemap code uses lon2/lat2 columns directly. Kept for reference; it can
# probably be removed.
transform_coord <- function(coords) {
  data.frame(
    lon2 = coords$lon,
    lat2 = coords$lat
  )
}
# FIX: data.map is already one row per Dive.Name, so the original second
# group_by()/summarize() pass was a redundant no-op; just duplicate the
# coordinate columns under the names the (commented-out) map layers expect.
data_transformed <- data.map %>%
  mutate(lon2 = lon, lat2 = lat)
# Plotting the map
# NOTE: if asked a question in console, click YES
#basemap(limits = c(-175, -129, 51, 61), bathy.style = 'poly_grays', crs = 4326) + #bathymetry=TRUE, bathy.style = 'rcb'
# geom_point(data = data_transformed, aes(x = lon2, y = lat2, color = factor(feature)), size = 3) +
# geom_text(data = data_transformed, aes(x = lon2, y = lat2, label = Dive.Name),
# hjust = 0, vjust = 1.5, size = 3, color = "black") +
# scale_color_viridis_d(name = "Feature") +
# ggspatial::annotation_scale(location = "br") +
# ggspatial::annotation_north_arrow(location = "tr", which_north = "true") +
# theme_minimal() +
# labs(title = "Dive Locations Map", x = "Longitude", y = "Latitude")
#Feature No: 1 canyon ; 2 ridge, 3 seamount, 4 seep, 5 slope